In [43]:
import pandas as pd
import json

In [44]:
df = pd.read_csv('../data/movies.csv')

In [45]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,genres,id,keywords,title,overview,cast,crew,poster_path
0,0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",https://image.tmdb.org/t/p/w500https://image.t...


## **Helper Functions**

In [46]:
def extract_name(column):
    a = json.loads(column)
    names = [x['name'] for x in a]
    return names

def extract_name_from_cast(column):
    a = json.loads(column)[:3]
    names = [x['name'] for x in a]
    return names

def extract_director_name(column):
    a = json.loads(column)
    for x in a:
        if x['job'] == 'Dire  ctor':
            return x['name']
    return None

def preprocess_names(lst):
    return [x.replace(" ","") for x in lst]

def preprocess__crew_names(x):
    return x.replace(" ","")

## **Preprocessing**

In [47]:
df.loc[:, "genres"] = df["genres"].apply(extract_name)
df.loc[:, "keywords"] = df["keywords"].apply(extract_name)
df.loc[:, "cast"] = df["cast"].apply(extract_name_from_cast)
df.loc[:, "crew"] = df["crew"].apply(extract_director_name)

In [48]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,genres,id,keywords,title,overview,cast,crew,poster_path
0,0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",,https://image.tmdb.org/t/p/w500https://image.t...


In [49]:
df["crew"] = df["crew"].fillna("[]")
df = df.dropna()
df.isnull().sum()

Unnamed: 0     0
genres         0
id             0
keywords       0
title          0
overview       0
cast           0
crew           0
poster_path    0
dtype: int64

In [50]:
df.loc[:, 'overview'] = df['overview'].apply(lambda x:x.split())
df.loc[:, "genres"] = df["genres"].apply(preprocess_names)
df.loc[:, "keywords"] = df["keywords"].apply(preprocess_names)
df.loc[:, "cast"] = df["cast"].apply(preprocess_names)
df.loc[:, "crew"] = df["crew"].apply(preprocess__crew_names)
df.loc[:, "crew"] = df["crew"].apply(lambda x:[x])



In [51]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,genres,id,keywords,title,overview,cast,crew,poster_path
0,0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[[]],https://image.tmdb.org/t/p/w500https://image.t...


In [52]:
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']

In [53]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,genres,id,keywords,title,overview,cast,crew,poster_path,tags
0,0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[[]],https://image.tmdb.org/t/p/w500https://image.t...,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [54]:
df = df[['id','title','tags','poster_path']]

In [55]:
df.loc[:,'tags'] = df['tags'].apply(lambda x:" ".join(x))
df.loc[:,'tags'] = df['tags'].apply(lambda x:x.lower())

In [56]:
df.head()

Unnamed: 0,id,title,tags,poster_path
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",https://image.tmdb.org/t/p/w500https://image.t...
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",https://image.tmdb.org/t/p/w500https://image.t...
2,206647,Spectre,a cryptic message from bond’s past sends him o...,https://image.tmdb.org/t/p/w500https://image.t...
3,49026,The Dark Knight Rises,following the death of district attorney harve...,https://image.tmdb.org/t/p/w500https://image.t...
4,49529,John Carter,"john carter is a war-weary, former military ca...",https://image.tmdb.org/t/p/w500https://image.t...


In [57]:
df.to_csv("../data/preprocessed_movies.csv")

In [None]:
import pickle

pickle.dump(df.to_dict(), open("../artifacts/movies.pkl", "wb"))