In [7]:
# Load the dataset (dataframe; DF), apply first measures to reduce the quantity of document entities (DE)

import pandas as pd
from xml.etree import ElementTree as ET

# Load the dataset
path = r'/home/jovyan/sema/python/data/wiki_movie_plots_deduped.csv'
df = pd.read_csv(path)

# First stage of reduction: eliminate unknown and NaN values, only use DE from 2000 onwards
res = df[
            (df['Director'] != 'Unknown') & 
            (df['Cast'] != 'NaN') & 
            (df['Genre'] != 'unknown') & 
            (df['Release Year'] >= 2000)
]

# Make index column
res.reset_index(drop=True, inplace=True)
res.reset_index(inplace=True)

In [8]:
# Get to know the remaining DE
# Print keys of DF
keys = list(df.keys())
print(f'Keys: {", ".join(keys)}')

# Print # of DE
print(f'Number of DE: {len(res)}')

Keys: Release Year, Title, Origin/Ethnicity, Director, Cast, Genre, Wiki Page, Plot
Number of DE: 9975


In [9]:
# Get count of movies (#m) by release year
print(res['Release Year'].value_counts().sort_index())

Release Year
2000    372
2001    350
2002    363
2003    349
2004    439
2005    424
2006    627
2007    570
2008    552
2009    659
2010    662
2011    691
2012    693
2013    817
2014    712
2015    531
2016    558
2017    606
Name: count, dtype: int64


In [10]:
# Get #m by origin/ethnicity
e = res['Origin/Ethnicity']

mov_by_eth = e.value_counts()
print(f'#Movies by Origin/Ethnicity: \n{mov_by_eth}')

#Movies by Origin/Ethnicity: 
Origin/Ethnicity
American       3744
Bollywood       956
Tamil           777
Telugu          706
Malayalam       546
Japanese        464
Canadian        439
Chinese         411
British         375
Bengali         250
Australian      236
Kannada         209
Hong Kong       205
Russian         152
Marathi         137
Filipino        128
Bangladeshi      72
Punjabi          65
Turkish          59
Malaysian        33
Assamese          9
Maldivian         2
Name: count, dtype: int64


In [11]:
# Get #m by director
d = res['Director'].str.replace('/', ',')

dir_with_slash = d[d.str.contains('/')]
assert \
    len(dir_with_slash) == 0, \
    f"There are still lists of directors, separated with a '/' instead of ','!"

mov_by_director = d.value_counts()
print(f'#Movies by Director: \n{mov_by_director}')

#Movies by Director: 
Director
Puri Jagannadh             24
Priyadarshan               20
Ridley Scott               15
Tyler Perry                15
Ram Gopal Varma            15
                           ..
Ma Yuan, Dong Dake          1
Adrian Kwan                 1
Peng Sanyuan                1
Will Speck, Josh Gordon     1
Ferzan Özpetek              1
Name: count, Length: 6023, dtype: int64


In [12]:
# Get #m by genre
g = res['Genre'].str.replace('/', ',')

genr_with_slash = g[g.str.contains('/')]
assert \
    len(genr_with_slash) == 0, \
    f"There are still lists of genres, separated with a '/' instead of ','!"

mov_by_genre = g.value_counts()
print(f'#Movies by Genre: \n{mov_by_genre}')

#Movies by Genre: 
Genre
drama                               1642
comedy                              1186
romance                              592
action                               542
thriller                             450
                                    ... 
drama , comedy , action                1
drama , animation , fantasy            1
comedy , adventure                     1
animation , music                      1
adventure, romance, fantasy film       1
Name: count, Length: 1574, dtype: int64


In [20]:
'''
Decision upon how to reduce the set to 5000 DE and of what measures to introduce before export:
- Only consider plots of movies released in and after the year 2000
- Further reduction of plots by a factor of 0.5, respecting the movies per origin/ethnicity proportion
- Replace '/'-separators in lists of directors and genres by ','-separators
'''
# Work on a copy, as pandas throws errors otherwise
res_copy = res.copy() 

# Replace '/'-separators in lists of directors and genres by ','-separators
# Assert that replacements have taken place
res_copy['Genre'] = res_copy['Genre'].str.replace('/', ',')
res_copy['Director'] = res_copy['Director'].str.replace('/', ',')

genr_w_slash = res_copy['Genre'][res_copy['Genre'].str.contains('/')]
assert \
    len(genr_w_slash) == 0, \
    f"There are still lists of genres, separated with a '/' instead of ','!"

dir_w_slash = res_copy['Director'][res_copy['Director'].str.contains('/')]
assert \
    len(dir_w_slash) == 0, \
    f"There are still lists of directors, separated with a '/' instead of ','!"

# Extend the dataframe with a new columns holding the count of movies by origin/ethnicity
# The 'Eth_Counts'-column will be used as the weight for the sample
res_copy['Eth_Counts'] = e.map(e.value_counts())

# Get a sample of 5000 DE for the search set
search_set = res_copy.sample(n=5000, weights='Eth_Counts', ignore_index=True).drop(columns=['Eth_Counts'])
print(search_set)

# Get a sample of 10 DE for the golden set
golden_set = res_copy.sample(n=10, weights='Eth_Counts', ignore_index=True).drop(columns=['Eth_Counts'])
# print(golden_set[['id', 'title', 'origin']])

def setProperties(parent, data, name):
    child = ET.SubElement(parent, 'field')
    child.text = data
    child.attrib = {'name':name}

def xmlFromDf(df):
    print(len(df))
    i = 0
    add = ET.Element('add')
    for ind, row in df.iterrows():
        doc = ET.SubElement(add, 'doc')
        f_id = setProperties(doc, str(row['index']), 'id')
        f_rel = setProperties(doc, str(row['Release Year']), 'release')
        f_title = setProperties(doc, str(row['Title']), 'title')
        f_origin = setProperties(doc, str(row['Origin/Ethnicity']), 'origin')
        f_director = setProperties(doc, str(row['Director']), 'director')
        f_cast = setProperties(doc, str(row['Cast']), 'cast')
        f_genre = setProperties(doc, str(row['Genre']), 'genre')
        f_wikipage = setProperties(doc, str(row['Wiki Page']), 'wikipage')
        f_plot = setProperties(doc, str(row['Plot']), 'plot')
        i += 1
    
    print(i)
    return add

def dfToXmlExport(df, path):
    test = df.reset_index(drop=True, inplace=false)
    test.reset_index(inplace=True)
    print(test)
    
    add = xmlFromDf(df)
    print(add)
    tree = ET.ElementTree(add)
    ET.indent(tree, space="\t", level=0)
    # tree.write(path, xml_declaration=False, encoding='utf-8', method='xml')

# Create XML for the search and for the golden set and export to fs
dfToXmlExport(search_set, './data/search_set.xml')
# dfToXmlExport(golden_set, './data/golden_set.xml')

      index  Release Year                 Title Origin/Ethnicity   
0      2480          2011           Margin Call         American  \
1      3212          2014      Ping Pong Summer         American   
2      2549          2011       We Bought a Zoo         American   
3      6719          2014                Haider        Bollywood   
4      3196          2014         Walk of Shame         American   
...     ...           ...                   ...              ...   
4995   6443          2011     Pyar ka Punchnama        Bollywood   
4996   1677          2007         Lust, Caution         American   
4997   2216          2010            Date Night         American   
4998   3104          2013  Thor: The Dark World         American   
4999   8386          2015          Kaaka Muttai            Tamil   

             Director                                               Cast   
0        J.C. Chandor  Kevin Spacey, Paul Bettany, Jeremy Irons, Zach...  \
1       Michael Tully  Susan Sa

NameError: name 'false' is not defined

In [None]:
'''
import pandas as pd
from xml.etree import ElementTree as ET

# Load the dataset
path = r'/home/jovyan/work/data/golden_set_original.json'
df = pd.read_json(path)

def setProperties(child, data, name, boost=1.0):
    child.text = data
    child.attrib = {'name':name, 'boost':boost}

def xmlFromDf(df):
    add = ET.Element('add')
    for ind, row in df.iterrows():
        doc = ET.SubElement(add, 'doc')
    
        f_id = ET.SubElement(doc, 'field')
        setProperties(f_id, str(row['id']), 'id', '1.0')
    
        f_rel = ET.SubElement(doc, 'field')
        setProperties(f_rel, str(row['release']), 'release', '1.0')

        f_title = ET.SubElement(doc, 'field')
        setProperties(f_title, str(row['title']), 'title', '4.0')

        f_origin = ET.SubElement(doc, 'field')
        setProperties(f_origin, str(row['origin']), 'origin', '1.0')

        f_director = ET.SubElement(doc, 'field')
        setProperties(f_director, str(row['director']), 'director', '1.0')

        f_cast = ET.SubElement(doc, 'field')
        setProperties(f_cast, str(row['cast']), 'cast', '1.0')
        
        f_genre = ET.SubElement(doc, 'field')
        setProperties(f_genre, str(row['genre']), 'genre', '1.0')

        f_wikipage = ET.SubElement(doc, 'field')
        setProperties(f_wikipage, str(row['wikipage']), 'wikipage', '1.0')

        f_plot = ET.SubElement(doc, 'field')
        setProperties(f_plot, str(row['plot']), 'plot', '2.0')    
    return add

add = xmlFromDf(df)

# print(ET.tostring(add))
tree = ET.ElementTree(add)
ET.indent(tree, space="\t", level=0)
tree.write('./data/golden_set.xml', xml_declaration=False, encoding='utf-8', method='xml')
'''