In [None]:
# Load the dataset (dataframe; DF), apply first measures to reduce the quantity of document entities (DE)

import pandas as pd
from xml.etree import ElementTree as ET

# Load the dataset
path = r'/home/jovyan/work/data/wiki_movie_plots_deduped.csv'
df = pd.read_csv(path)

# First stage of reduction: eliminate unknown and NaN values, only use DE from 2000 onwards.
# read_csv converts empty cells (and the literal text 'NaN') into real missing values, so a
# comparison against the string 'NaN' never excludes a row; notna() is what actually drops
# the entries without a cast.
keep = (
    (df['Director'] != 'Unknown')
    & df['Cast'].notna()
    & (df['Genre'] != 'unknown')
    & (df['Release Year'] >= 2000)
)

# Make index column: renumber the surviving rows 0..n-1 and expose that number as the
# 'index' column. Chaining returns a new frame instead of mutating a slice of df in place,
# which keeps the cell idempotent on re-run.
res = df.loc[keep].reset_index(drop=True).reset_index()

In [None]:
# Inspect what is left after the first reduction stage
# Column names of the loaded DF
keys = [*df.keys()]
print('Keys: ' + ', '.join(keys))

# Number of DE that passed the filter
print('Number of DE: ' + str(len(res)))

In [None]:
# Count of movies (#m) per release year, listed in chronological order
movies_per_year = res['Release Year'].value_counts()
print(movies_per_year.sort_index())

In [None]:
# #m per origin/ethnicity; `e` is kept as a global because a later cell reads it
e = res.loc[:, 'Origin/Ethnicity']

mov_by_eth = e.value_counts()
print('#Movies by Origin/Ethnicity: \n{}'.format(mov_by_eth))

In [None]:
# #m per director, after unifying the list separator ('/' becomes ',')
d = res['Director'].str.replace('/', ',')

# Sanity check: no entry may still carry the old separator
dir_with_slash = d.loc[d.str.contains('/')]
assert dir_with_slash.empty, "There are still lists of directors, separated with a '/' instead of ','!"

mov_by_director = d.value_counts()
print('#Movies by Director: \n{}'.format(mov_by_director))

In [None]:
# #m per genre, after unifying the list separator ('/' becomes ',')
g = res['Genre'].str.replace('/', ',')

# Sanity check: no entry may still carry the old separator
genr_with_slash = g.loc[g.str.contains('/')]
assert genr_with_slash.empty, "There are still lists of genres, separated with a '/' instead of ','!"

mov_by_genre = g.value_counts()
print('#Movies by Genre: \n{}'.format(mov_by_genre))

In [None]:
'''
Decision upon how to reduce the set to 5000 DE and of what measures to introduce before export:
- Only consider plots of movies released in and after the year 2000
- Further reduction of plots by a factor of 0.5, respecting the movies per origin/ethnicity proportion
- Replace '/'-separators in lists of directors and genres by ','-separators
'''
# Work on a copy so the columns rewritten/added below never touch `res` itself
res_copy = res.copy()

# Replace '/'-separators in lists of directors and genres by ','-separators.
# regex=False makes the literal match explicit instead of relying on the pandas
# version's default for single-character patterns.
res_copy['Genre'] = res_copy['Genre'].str.replace('/', ',', regex=False)
res_copy['Director'] = res_copy['Director'].str.replace('/', ',', regex=False)

# Assert that replacements have taken place
genr_w_slash = res_copy['Genre'][res_copy['Genre'].str.contains('/', regex=False)]
assert \
    len(genr_w_slash) == 0, \
    "There are still lists of genres, separated with a '/' instead of ','!"

dir_w_slash = res_copy['Director'][res_copy['Director'].str.contains('/', regex=False)]
assert \
    len(dir_w_slash) == 0, \
    "There are still lists of directors, separated with a '/' instead of ','!"

# Extend the dataframe with a new column holding the count of movies by origin/ethnicity.
# The counts are derived from res_copy itself, so this cell no longer depends on a
# variable left behind by an earlier exploration cell.
# The 'Eth_Counts'-column will be used as the weight for the sample
res_copy['Eth_Counts'] = res_copy['Origin/Ethnicity'].map(
    res_copy['Origin/Ethnicity'].value_counts()
)

# NOTE(review): weighting every row by the size of its group makes a group's share of the
# sample grow roughly with the square of its size, so large origins end up over-represented.
# A plain (unweighted) sample already keeps the origin proportions in expectation - confirm
# which of the two is intended.

# Fixed seeds make the exported sets reproducible across kernel restarts
SAMPLE_SEED = 42

# Get a sample of 5000 DE for the search set
search_set = res_copy.sample(
    n=5000, weights='Eth_Counts', random_state=SAMPLE_SEED, ignore_index=True
).drop(columns=['Eth_Counts'])

# Get a sample of 10 DE for the golden set
# NOTE(review): drawn independently from res_copy, so a golden DE is not guaranteed to be
# part of search_set - confirm that this is acceptable.
golden_set = res_copy.sample(
    n=10, weights='Eth_Counts', random_state=SAMPLE_SEED + 1, ignore_index=True
).drop(columns=['Eth_Counts'])
# print(golden_set[['index', 'Title', 'Origin/Ethnicity']])

def setProperties(parent, data, name, boost=1.0):
    """Append a ``<field>`` child to ``parent`` and return it.

    Args:
        parent: ElementTree element that receives the new child.
        data: Text content of the field; stored on the element unchanged.
        name: Value of the field's ``name`` attribute.
        boost: Value of the field's ``boost`` attribute. Numbers are accepted and
            converted to text, because ElementTree can only serialise string
            attribute values (the float default used to fail on write).

    Returns:
        The newly created ``field`` element.
    """
    # Attributes are handed to SubElement directly instead of replacing the
    # element's attrib mapping afterwards; 'name' stays ahead of 'boost'.
    child = ET.SubElement(parent, 'field', {'name': str(name), 'boost': str(boost)})
    child.text = data
    return child

def xmlFromDf(df):
    """Build an ``<add>`` element that holds one ``<doc>`` per dataframe row.

    Every row contributes nine ``<field>`` children (id, release, title, origin,
    director, cast, genre, wikipage, plot), each value converted with ``str``.
    Title carries boost '4.0', plot '2.0', all other fields '1.0'.
    The layout looks like a search-index update document - confirm the consumer.

    Args:
        df: Dataframe providing the columns 'index', 'Release Year', 'Title',
            'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot'.

    Returns:
        The root ``add`` element.

    Side effects:
        Prints the number of rows before and the number of documents after the
        conversion, so the two can be compared by eye.
    """
    print(len(df))

    # (dataframe column, field name, boost) in output order
    field_specs = (
        ('index', 'id', '1.0'),
        ('Release Year', 'release', '1.0'),
        ('Title', 'title', '4.0'),
        ('Origin/Ethnicity', 'origin', '1.0'),
        ('Director', 'director', '1.0'),
        ('Cast', 'cast', '1.0'),
        ('Genre', 'genre', '1.0'),
        ('Wiki Page', 'wikipage', '1.0'),
        ('Plot', 'plot', '2.0'),
    )

    root = ET.Element('add')
    for _, record in df.iterrows():
        doc_node = ET.SubElement(root, 'doc')
        for column, field_name, boost in field_specs:
            setProperties(doc_node, str(record[column]), field_name, boost)

    # One <doc> child exists per processed row
    print(len(root))
    return root

def dfToXmlExport(df, path):
    """Convert ``df`` to XML via ``xmlFromDf`` and write the result to ``path``.

    Args:
        df: Dataframe to export.
        path: Target file; written as UTF-8 without an XML declaration.
    """
    document = ET.ElementTree(xmlFromDf(df))
    # Tab-indent nested elements so the exported file is readable
    ET.indent(document, space="\t", level=0)
    document.write(path, encoding='utf-8', xml_declaration=False, method='xml')

# Write the search set as XML to the file system; the golden-set export stays switched off
dfToXmlExport(df=search_set, path='./data/search_set.xml')
# dfToXmlExport(golden_set, './data/golden_set.xml')

In [None]:
# Disabled cell: everything below is one triple-quoted string, so none of it executes.
# The text holds an alternative export that reads golden_set_original.json and writes
# ./data/golden_set.xml. Its setProperties/xmlFromDf variants differ from the live
# definitions in the cell above (setProperties takes an already-created element, and the
# rows are read through lower-case column names); re-enabling the code would redefine
# those names, as well as `path` and `df`.
# NOTE(review): as the only expression in the cell, the string is presumably echoed as
# the cell's output - confirm, and consider moving this code out of the notebook.
'''
import pandas as pd
from xml.etree import ElementTree as ET

# Load the dataset
path = r'/home/jovyan/work/data/golden_set_original.json'
df = pd.read_json(path)

def setProperties(child, data, name, boost=1.0):
    child.text = data
    child.attrib = {'name':name, 'boost':boost}

def xmlFromDf(df):
    add = ET.Element('add')
    for ind, row in df.iterrows():
        doc = ET.SubElement(add, 'doc')
    
        f_id = ET.SubElement(doc, 'field')
        setProperties(f_id, str(row['id']), 'id', '1.0')
    
        f_rel = ET.SubElement(doc, 'field')
        setProperties(f_rel, str(row['release']), 'release', '1.0')

        f_title = ET.SubElement(doc, 'field')
        setProperties(f_title, str(row['title']), 'title', '4.0')

        f_origin = ET.SubElement(doc, 'field')
        setProperties(f_origin, str(row['origin']), 'origin', '1.0')

        f_director = ET.SubElement(doc, 'field')
        setProperties(f_director, str(row['director']), 'director', '1.0')

        f_cast = ET.SubElement(doc, 'field')
        setProperties(f_cast, str(row['cast']), 'cast', '1.0')
        
        f_genre = ET.SubElement(doc, 'field')
        setProperties(f_genre, str(row['genre']), 'genre', '1.0')

        f_wikipage = ET.SubElement(doc, 'field')
        setProperties(f_wikipage, str(row['wikipage']), 'wikipage', '1.0')

        f_plot = ET.SubElement(doc, 'field')
        setProperties(f_plot, str(row['plot']), 'plot', '2.0')    
    return add

add = xmlFromDf(df)

# print(ET.tostring(add))
tree = ET.ElementTree(add)
ET.indent(tree, space="\t", level=0)
tree.write('./data/golden_set.xml', xml_declaration=False, encoding='utf-8', method='xml')
'''