# Data Cleanse

In [1]:
import pandas as pd
import numpy as np

## Data description

In [2]:
# Load the raw movie table from disk; the cells below clean this frame.
raw = pd.read_csv('data/raw.csv')
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which suggests
# the CSV carries a saved index -- confirm whether it should be kept.
raw

Unnamed: 0,tconst,name,year,rating,rating_count,story_line,genres,country,language,budget,gross,runtime,director,stars
0,tt2077677,3Simoa,2012.0,6.4,370,"A caper comedy about two friends, Simo and Las...",Comedy,Finland,Finnish,"EUR275,000","$93,296",1h 26min,Teemu Nikki,"Olli Rahkonen,Paula Vesala,Rami Rusinen"
1,tt0106332,Ba wang bie ji,1993.0,8.1,24830,"""Farewell, My Concubine"" is a movie with two p...","Drama,Music,Romance","China,Hong Kong",Mandarin,"$4,000,000","$5,985,074",2h 51min,Kaige Chen,"Fengyi Zhang,Leslie Cheung,Li Gong"
2,tt0116421,The Glimmer Man,1996.0,5.4,18687,"Jack Cole is a soft spoken, mystical, new age ...","Action,Comedy,Crime",USA,"Chinese,English,Russian","$45,000,000","$20,351,264",1h 31min,John Gray,"Bob Gunton,Keenen Ivory Wayans,Steven Seagal"
3,tt3170832,Room,2015.0,8.1,365849,"ROOM tells the extraordinary story of Jack, a ...","Drama,Thriller","Canada,Ireland,UK,USA",English,"$13,000,000","$35,401,758",1h 58min,Lenny Abrahamson,"Brie Larson,Jacob Tremblay,Sean Bridgers"
4,tt0204175,Boys and Girls,2000.0,5.4,15942,"Jennifer and Ryan are students at UC Berkeley,...","Comedy,Drama,Romance",USA,English,"$35,000,000","$25,850,615",1h 34min,Robert Iscove,"Brendon Ryan Barrett,Claire Forlani,Freddie Pr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,tt2083231,Java Heat,2013.0,5.1,4629,Story centers on the aftermath of a suicide bo...,"Action,Crime,Thriller","Indonesia,USA","English,Indonesian","$15,000,000","$189,739",1h 44min,Conor Allyn,"Kellan Lutz,Mickey Rourke,Verdi Solaiman"
996,tt2158531,Holla II,2013.0,6.6,87,After narrowly escaping with her life at the h...,"Comedy,Horror",USA,English,"$1,000,000","$41,537",1h 30min,H.M. Coakley,"Greg Cipes,Kiely Williams,Vanessa Bell Calloway"
997,tt0079945,Star Trek: The Motion Picture,1979.0,6.4,81000,A massive alien spacecraft of enormous power d...,"Adventure,Mystery,Sci-Fi",USA,"English,Klingon","$35,000,000","$82,604,699",2h 12min,Robert Wise,"DeForest Kelley,Leonard Nimoy,William Shatner"
998,tt1653649,Chuzhaya,2010.0,6.5,873,A Ukrainian mob boss sends a group of henchmen...,"Action,Drama","Russia,USA",Russian,"$4,000,000","$1,838,858",1h 40min,Anton Bormatov,"Evgeniy Tkachuk,Kirill Polukhin,Natalya Romany..."


## Preprocessing function definition

In [3]:
def to_int(x):
    """Convert a comma-grouped number string such as ``"1,234,567"`` to an int.

    Parameters
    ----------
    x : str or any other value
        A string of digit groups separated by commas. Any non-string value
        (NaN floats, ints that were already converted, numpy scalars, ...)
        is returned unchanged, so applying this function a second time to an
        already-cleaned column is a no-op instead of an AttributeError.

    Returns
    -------
    int
        The parsed integer for string input; otherwise ``x`` itself.

    Raises
    ------
    ValueError
        If the string is not an integer literal once the commas are removed.
    """
    if not isinstance(x, str):
        return x
    # Concatenate the groups rather than weighting each one by a power of 1000:
    # this yields the same value for well-formed input and also keeps a leading
    # sign attached to the whole number ("-1,234" -> -1234).
    digits = ''.join(group.strip() for group in x.split(','))
    return int(digits)

def usd(x):
    """Rewrite a leading ``'$'`` as the three-letter prefix ``'USD'``.

    ``"$93,296"`` becomes ``"USD93,296"``. An empty string, or a value whose
    first character is not ``'$'``, is returned unchanged.
    """
    if x != "" and x[0] == '$':
        return "USD" + x[1:]
    return x

def currency(x):
    """Split ``x`` into its first three characters and the remainder.

    For a value like ``"EUR275,000"`` this gives ``("EUR", "275,000")``.
    """
    code = x[:3]
    amount = x[3:]
    return code, amount

def parse_time(time):
    """Convert a runtime string such as ``"1h 26min"`` to whole minutes.

    Parameters
    ----------
    time : str or any other value
        Accepted string shapes are ``"<H>h <M>min"``, ``"<H>h"`` and
        ``"<M>min"``. Any non-string value (NaN floats, minutes that were
        already converted to ints, ...) is returned unchanged, so re-running
        the conversion on a cleaned column does not fail.

    Returns
    -------
    int or float
        Total minutes for a parsable string, NaN for a blank string,
        otherwise ``time`` itself.

    Raises
    ------
    ValueError
        If the numeric part of a token is not an integer.
    """
    if not isinstance(time, str):
        return time
    parts = time.strip().split()
    if not parts:
        # A blank string carries no runtime; report it as missing instead of
        # failing on parts[0].
        return float('nan')
    if len(parts) == 2:
        # "<H>h <M>min": drop the trailing 'h' and 'min' suffixes.
        return int(parts[0][:-1]) * 60 + int(parts[1][:-3])
    token = parts[0]
    if token[-1] == 'h':
        # Hours only.
        return int(token[:-1]) * 60
    # Minutes only.
    return int(token[:-3])
        
# cc[cc['currency']=='AED']['rate_to_usd'][0]
def parse_currency(x, cc_d):
    """Convert a money string such as ``"EUR275,000"`` or ``"$93,296"`` to USD.

    Parameters
    ----------
    x : str or any other value
        Amount prefixed by ``'$'`` or by a three-character currency code.
        Any non-string value (NaN floats, amounts that were already converted
        to numbers, ...) is returned unchanged, so re-running the conversion
        cell on a cleaned column does not fail.
    cc_d : mapping
        Maps a currency code to the factor the amount is multiplied by
        to obtain USD.

    Returns
    -------
    int or float
        ``int(amount * rate)`` (truncated) when the code is in ``cc_d``,
        ``np.nan`` when it is not, otherwise ``x`` itself.
    """
    if not isinstance(x, str):
        return x
    # Normalise '$...' to 'USD...' so every value starts with a 3-char code.
    code, amount = currency(usd(x))
    if code not in cc_d:
        return np.nan
    return int(to_int(amount) * cc_d[code])

## Clean

### Convert currency
*Convert 'gross' and 'budget' to USD*

In [4]:
# Get currency convert rate to USD
cc = pd.read_csv('data/currency.csv')
cc_list = cc['currency'].tolist()
# Build {currency code: rate_to_usd} in a single pass over the table instead of
# re-filtering the whole frame once per currency.
cc_dict = dict()
for cc_item, cc_rate in zip(cc_list, cc['rate_to_usd'].tolist()):
    # setdefault keeps the first rate listed for a code if the file repeats it.
    cc_dict.setdefault(cc_item, cc_rate)

In [5]:
# Convert both money columns to USD amounts using the rate table.
# This overwrites the columns of `raw` in place.
cc_list = cc['currency'].tolist()
raw['gross'] = raw['gross'].apply(parse_currency, args=(cc_dict,))
raw['budget'] = raw['budget'].apply(parse_currency, args=(cc_dict,))
raw.head()

Unnamed: 0,tconst,name,year,rating,rating_count,story_line,genres,country,language,budget,gross,runtime,director,stars
0,tt2077677,3Simoa,2012.0,6.4,370,"A caper comedy about two friends, Simo and Las...",Comedy,Finland,Finnish,334784.0,93296,1h 26min,Teemu Nikki,"Olli Rahkonen,Paula Vesala,Rami Rusinen"
1,tt0106332,Ba wang bie ji,1993.0,8.1,24830,"""Farewell, My Concubine"" is a movie with two p...","Drama,Music,Romance","China,Hong Kong",Mandarin,4000000.0,5985074,2h 51min,Kaige Chen,"Fengyi Zhang,Leslie Cheung,Li Gong"
2,tt0116421,The Glimmer Man,1996.0,5.4,18687,"Jack Cole is a soft spoken, mystical, new age ...","Action,Comedy,Crime",USA,"Chinese,English,Russian",45000000.0,20351264,1h 31min,John Gray,"Bob Gunton,Keenen Ivory Wayans,Steven Seagal"
3,tt3170832,Room,2015.0,8.1,365849,"ROOM tells the extraordinary story of Jack, a ...","Drama,Thriller","Canada,Ireland,UK,USA",English,13000000.0,35401758,1h 58min,Lenny Abrahamson,"Brie Larson,Jacob Tremblay,Sean Bridgers"
4,tt0204175,Boys and Girls,2000.0,5.4,15942,"Jennifer and Ryan are students at UC Berkeley,...","Comedy,Drama,Romance",USA,English,35000000.0,25850615,1h 34min,Robert Iscove,"Brendon Ryan Barrett,Claire Forlani,Freddie Pr..."


### Convert runtime
*Convert the raw <u>'(x)h (y)min'</u> runtime string to a whole number of minutes*

In [6]:
# Replace each runtime string with its length in minutes (in place on `raw`).
raw['runtime'] = raw['runtime'].apply(parse_time)
raw['runtime']

0       86
1      171
2       91
3      118
4       94
      ... 
995    104
996     90
997    132
998    100
999     76
Name: runtime, Length: 1000, dtype: int64

### Convert rating_count
*Convert 'rating_count' from a comma-grouped string to an integer (float values such as NaN are left unchanged)*

In [7]:
# Parse the comma-grouped vote counts into numbers (in place on `raw`).
raw['rating_count'] = raw['rating_count'].apply(to_int)
raw.head()

Unnamed: 0,tconst,name,year,rating,rating_count,story_line,genres,country,language,budget,gross,runtime,director,stars
0,tt2077677,3Simoa,2012.0,6.4,370,"A caper comedy about two friends, Simo and Las...",Comedy,Finland,Finnish,334784.0,93296,86,Teemu Nikki,"Olli Rahkonen,Paula Vesala,Rami Rusinen"
1,tt0106332,Ba wang bie ji,1993.0,8.1,24830,"""Farewell, My Concubine"" is a movie with two p...","Drama,Music,Romance","China,Hong Kong",Mandarin,4000000.0,5985074,171,Kaige Chen,"Fengyi Zhang,Leslie Cheung,Li Gong"
2,tt0116421,The Glimmer Man,1996.0,5.4,18687,"Jack Cole is a soft spoken, mystical, new age ...","Action,Comedy,Crime",USA,"Chinese,English,Russian",45000000.0,20351264,91,John Gray,"Bob Gunton,Keenen Ivory Wayans,Steven Seagal"
3,tt3170832,Room,2015.0,8.1,365849,"ROOM tells the extraordinary story of Jack, a ...","Drama,Thriller","Canada,Ireland,UK,USA",English,13000000.0,35401758,118,Lenny Abrahamson,"Brie Larson,Jacob Tremblay,Sean Bridgers"
4,tt0204175,Boys and Girls,2000.0,5.4,15942,"Jennifer and Ryan are students at UC Berkeley,...","Comedy,Drama,Romance",USA,English,35000000.0,25850615,94,Robert Iscove,"Brendon Ryan Barrett,Claire Forlani,Freddie Pr..."


### Save to CSV

In [8]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
raw.to_csv(path_or_buf='data/cleaned.csv', index=False)