In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
from collections import Counter
import statistics as stat

## Own functions
import _functions_1115 as func

## Stuff to do:
- Convert all fields to the best possible data type
- Remove unwanted headers: budget, revenue, unnamed, runtime
- Convert the string representations of lists into actual lists. Assign 3 new columns for each list
    - Headers in question are producers, screenplay, starring_casts, genre_id (4 headers)

In [2]:
## Read the raw (uncleaned) CSV into a dataframe and preview its first rows
data_raw = pd.read_csv(filepath_or_buffer='./data/check1.csv')
data_raw.head(n=5)

Unnamed: 0.1,Unnamed: 0,tmdb_id,imdb_id,year_film,year_ceremony,film,oscar_category,nominee,no_oscar_nominations,oscar_win,...,casts_popularity,director,producers,screenplay,crew_popularity,budget,revenue,runtime,tmdb_vote_average,tmdb_vote_count
0,0,53939,tt0054326,1960,1961,Sons and Lovers,Best Actor,Trevor Howard,7,False,...,21.268,Jack Cardiff,[],['Gavin Lambert'],6.253,0,0,103,6.9,18
1,1,22013,tt0053793,1960,1961,Elmer Gantry,Best Actor,Burt Lancaster,5,True,...,39.177,Richard Brooks,['Bernard Smith'],['Richard Brooks'],15.158,3000000,10400000,146,7.2,85
2,2,284,tt0053604,1960,1961,The Apartment,Best Actor,Jack Lemmon,10,False,...,42.158,Billy Wilder,['Billy Wilder'],[],40.091,3000000,25000000,125,8.2,1429
3,3,18929,tt0053796,1960,1961,The Entertainer,Best Actor,Laurence Olivier,1,False,...,37.9,Tony Richardson,['Harry Saltzman'],"['John Osborne', 'Nigel Kneale']",31.632,0,0,96,6.2,28
4,4,1908,tt0053946,1960,1961,Inherit the Wind,Best Actor,Spencer Tracy,4,False,...,73.32,Stanley Kramer,"['Stanley Kramer', 'Herman Shumlin']","['Nedrick Young', 'Harold Jacob Smith']",21.135,0,0,128,7.8,252


In [3]:
## data_cleaning is the working frame: each cleaning step below overwrites it until the
## final cleaned data is reached, while data_raw is left untouched
## (DataFrame.copy() makes a deep copy by default)
data_cleaning = data_raw.copy()

In [4]:
## discard the headers that serve no purpose in movie identification/machine learning
data_cleaning = data_cleaning.drop(
    columns=['Unnamed: 0', 'budget', 'revenue', 'runtime']
)

In [5]:
## convert to the best fit datatype
## convert_dtypes() returns a new frame rather than modifying in place, hence the re-assignment;
## the later .info() output shows the resulting nullable Int64 / string / boolean columns
data_cleaning = data_cleaning.convert_dtypes()

In [6]:
## give some headers shorter names, then preview the result
## NOTE: 'staring_casts' is deliberately left as spelled here - rename() only matches exact
## labels, and the preview shows a 'starring' column afterwards, so this is presumably the
## spelling used in the input file
data_cleaning = data_cleaning.rename(columns={
    'year_film': 'year',
    'oscar_category': 'oscar_cat',
    'no_oscar_nominations': 'nomination_count',
    'staring_casts': 'starring',
})
data_cleaning.head()

Unnamed: 0,tmdb_id,imdb_id,year,year_ceremony,film,oscar_cat,nominee,nomination_count,oscar_win,genre_id,starring,casts_popularity,director,producers,screenplay,crew_popularity,tmdb_vote_average,tmdb_vote_count
0,53939,tt0054326,1960,1961,Sons and Lovers,Best Actor,Trevor Howard,7,False,[18],"['Trevor Howard', 'Dean Stockwell', 'Wendy Hil...",21.268,Jack Cardiff,[],['Gavin Lambert'],6.253,6.9,18
1,22013,tt0053793,1960,1961,Elmer Gantry,Best Actor,Burt Lancaster,5,True,[18],"['Burt Lancaster', 'Jean Simmons', 'Arthur Ken...",39.177,Richard Brooks,['Bernard Smith'],['Richard Brooks'],15.158,7.2,85
2,284,tt0053604,1960,1961,The Apartment,Best Actor,Jack Lemmon,10,False,"[35, 18, 10749]","['Jack Lemmon', 'Shirley MacLaine', 'Fred MacM...",42.158,Billy Wilder,['Billy Wilder'],[],40.091,8.2,1429
3,18929,tt0053796,1960,1961,The Entertainer,Best Actor,Laurence Olivier,1,False,"[18, 10402]","['Laurence Olivier', 'Brenda De Banzie', 'Roge...",37.9,Tony Richardson,['Harry Saltzman'],"['John Osborne', 'Nigel Kneale']",31.632,6.2,28
4,1908,tt0053946,1960,1961,Inherit the Wind,Best Actor,Spencer Tracy,4,False,"[18, 36]","['Spencer Tracy', 'Fredric March', 'Gene Kelly...",73.32,Stanley Kramer,"['Stanley Kramer', 'Herman Shumlin']","['Nedrick Young', 'Harold Jacob Smith']",21.135,7.8,252


In [7]:
## Work out how many sub-columns each list-valued field needs by summarising
## the number of items per row
## NOTE: every value is formatted with %d, so a fractional mean/median is truncated to an
## integer in the printout, and the label says "genres" for all four columns
for col in ['genre_id','starring','producers','screenplay']:
    ## number of items in the parsed list of every row of this column
    list_lengths = [len(func.strtolist(entry)) for entry in data_cleaning[col]]

    summary_values = (
        col,
        stat.mean(list_lengths),
        stat.median(list_lengths),
        stat.mode(list_lengths),
        min(list_lengths),
        max(list_lengths),
    )

    print('''Stats for number of genres in column '%s':
    Mean:   %d
    Median: %d
    Mode:   %d
    Min:    %d
    Max:    %d\n''' % summary_values)

Stats for number of genres in column 'genre_id':
    Mean:   2
    Median: 2
    Mode:   2
    Min:    0
    Max:    5

Stats for number of genres in column 'starring':
    Mean:   3
    Median: 4
    Mode:   4
    Min:    1
    Max:    5

Stats for number of genres in column 'producers':
    Mean:   2
    Median: 2
    Mode:   1
    Min:    0
    Max:    10

Stats for number of genres in column 'screenplay':
    Mean:   1
    Median: 1
    Mode:   1
    Min:    0
    Max:    5



In [8]:
## Names of the sub-columns to create for each list-valued field.
## The number of sub-columns per field is the median number of items in its lists,
## taken from the cumulative stats printed above
new_headers = dict(
    genre_id=['genre_id_0', 'genre_id_1'],
    starring=['starring_0', 'starring_1', 'starring_2', 'starring_3'],
    producers=['producer_0', 'producer_1'],
    screenplay=['screenplay_0'],
)

## swap each string representation of a list for the actual list it describes
for item in new_headers:
    data_cleaning[item] = [func.strtolist(entry) for entry in data_cleaning[item]]

In [9]:
## Expand every list-valued column named in new_headers into several scalar sub-columns.
## The list-valued columns themselves are not useful for machine learning, so sub-column k
## receives the k-th item of each row's list, or None when that row's list is too short.

for currheader, subcol_names in new_headers.items():
    ## only expand source columns that actually exist in the frame
    if currheader not in data_cleaning:
        continue

    ## pull the column out once instead of re-indexing the frame for every single cell
    row_lists = data_cleaning[currheader].tolist()

    for subcol, subcol_name in enumerate(subcol_names):
        ## an explicit length check replaces the former bare `except:`, which also hid
        ## unrelated errors (and KeyboardInterrupt) behind a silent None
        data_cleaning[subcol_name] = [
            items[subcol] if subcol < len(items) else None
            for items in row_lists
        ]

## let pandas pick the best dtype for the freshly created sub-columns
data_cleaning = data_cleaning.convert_dtypes()

In [10]:
## drop further columns, including the list-valued ones (their items already live in the
## sub-columns created from them)
data_cleaning = data_cleaning.drop(columns=[
    'year_ceremony', 'nominee',
    'genre_id', 'starring', 'producers', 'screenplay',
])

In [11]:
## put the columns in a clearer order
col_list = list(data_cleaning.columns)
#display(col_list)

## target order, written as logical groups of headers
col_list_reorder = (
    ['tmdb_id', 'imdb_id', 'film', 'year']
    + ['nomination_count', 'oscar_cat', 'oscar_win']
    + ['casts_popularity', 'crew_popularity', 'tmdb_vote_average', 'tmdb_vote_count']
    + ['director']
    + ['genre_id_0', 'genre_id_1']
    + ['starring_0', 'starring_1', 'starring_2', 'starring_3']
    + ['producer_0', 'producer_1']
    + ['screenplay_0']
)
data_cleaning = data_cleaning.loc[:, col_list_reorder]
data_cleaning.info()

['tmdb_id',
 'imdb_id',
 'year',
 'film',
 'oscar_cat',
 'nomination_count',
 'oscar_win',
 'casts_popularity',
 'director',
 'crew_popularity',
 'tmdb_vote_average',
 'tmdb_vote_count',
 'genre_id_0',
 'genre_id_1',
 'starring_0',
 'starring_1',
 'starring_2',
 'starring_3',
 'producer_0',
 'producer_1',
 'screenplay_0']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1838 entries, 0 to 1837
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tmdb_id            1838 non-null   Int64  
 1   imdb_id            1838 non-null   string 
 2   film               1838 non-null   string 
 3   year               1838 non-null   Int64  
 4   nomination_count   1838 non-null   Int64  
 5   oscar_cat          1838 non-null   string 
 6   oscar_win          1838 non-null   boolean
 7   casts_popularity   1838 non-null   float64
 8   crew_popularity    1838 non-null   float64
 9   tmdb_vote_average  1838 non-null   float64
 10  tmdb_vote_count    1838 non-null   Int64  
 11  director           1838 non-null   string 
 12  genre_id_0         1836 non-null   Int64  
 13  genre_id_1         1399 non-null   Int64  
 14  starring_0         1838 non-null   string 
 15  starring_1         1837 non-null   string 
 16  starring_2         1836 

In [15]:
## cast the category, people-name and genre-id columns to the plain 'object' dtype
data_cleaning = data_cleaning.astype(dict.fromkeys(
    [
        'oscar_cat',
        'director',
        'genre_id_0', 'genre_id_1',
        'starring_0', 'starring_1', 'starring_2', 'starring_3',
        'producer_0', 'producer_1',
        'screenplay_0',
    ],
    'object',
))

In [16]:
## check the column dtypes and non-null counts after the explicit cast to object above
data_cleaning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1838 entries, 0 to 1837
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tmdb_id            1838 non-null   Int64  
 1   imdb_id            1838 non-null   string 
 2   film               1838 non-null   string 
 3   year               1838 non-null   Int64  
 4   nomination_count   1838 non-null   Int64  
 5   oscar_cat          1838 non-null   object 
 6   oscar_win          1838 non-null   boolean
 7   casts_popularity   1838 non-null   float64
 8   crew_popularity    1838 non-null   float64
 9   tmdb_vote_average  1838 non-null   float64
 10  tmdb_vote_count    1838 non-null   Int64  
 11  director           1838 non-null   object 
 12  genre_id_0         1836 non-null   object 
 13  genre_id_1         1399 non-null   object 
 14  starring_0         1838 non-null   object 
 15  starring_1         1837 non-null   object 
 16  starring_2         1836 

In [18]:
## freeze the final cleaned frame under its own name and write it to disk
## NOTE(review): to_csv writes the row index as an unnamed first column by default; pass
## index=False if downstream readers should not receive it - confirm before changing
data_clean = data_cleaning.copy()
data_clean.to_csv(path_or_buf='./data/check_clean.csv')

---
### Test cells below
---

In [None]:
## Scratch frame for prototyping the string -> list conversion.
## It is built from data_raw because data_cleaning no longer has 'genre_id'/'producers'
## once the drop of the list columns has run, which made this cell fail on a full re-run.
## convert_dtypes() mirrors the dtype conversion applied to data_cleaning and returns an
## independent frame, so the assignments below do not write into a slice of another frame.
test = data_raw[['genre_id','producers']].convert_dtypes()
#test = test.head(100)
## replacing by this method works, using list comprehension
for col in ['genre_id','producers']:
    test[col] = [func.strtolist(entry) for entry in test[col]]
display(test)

In [None]:
## Prototype: copy the first genre id of every row into its own column,
## using None for rows whose genre list is empty
## (an explicit length check replaces the former bare `except:`, which hid every error)
test_list = [
    genres[0] if len(genres) > 0 else None
    for genres in test['genre_id'].tolist()
]

test['genre_id_0'] = test_list


## do this last
## convert_dtypes() returns a new frame, so the result has to be re-assigned to take effect
test = test.convert_dtypes()
if 'genre_id' in test:
    print(1)

In [None]:
## Prototype of the sub-column expansion, run on the scratch frame `test`.
## For every source column listed in new_headers that exists in `test`, sub-column k
## receives the k-th item of each row's list, or None when the list is too short.
for currheader, subcol_names in new_headers.items():
    ## only expand source columns that actually exist in the scratch frame
    if currheader not in test:
        continue

    ## pull the column out once instead of re-indexing the frame for every single cell
    row_lists = test[currheader].tolist()

    for subcol, subcol_name in enumerate(subcol_names):
        ## explicit length check instead of a bare `except:` that would hide any error
        test[subcol_name] = [
            items[subcol] if subcol < len(items) else None
            for items in row_lists
        ]

test = test.convert_dtypes()
test