# Merge Data

In [1]:
import pandas as pd

# Root folder holding the input datasets, relative to this notebook.
data_path = '../data'
# Sub-folder of the data root that contains the MovieSummaries files.
summary_path = f'{data_path}/MovieSummaries'

In [2]:
# Load the movie metadata table from the MovieSummaries folder.
cmu_metadata_file = f'{summary_path}/movie_metadata_wikidata.csv'
cmu_df = pd.read_csv(cmu_metadata_file)

In [3]:
# Preview the first five rows of the loaded metadata.
cmu_df.head(n=5)

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,movie_name,movie_year,movie_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,wikidata_id
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,English Language,United States of America,"Science Fiction, Space western, Horror, Supern...",Q261700
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,English Language,United States of America,"Crime Drama, Mystery, Biographical film, Drama",Q16250726
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,Norwegian Language,Norway,"Crime Fiction, Drama",Q4978832
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,English Language,United Kingdom,"Psychological thriller, Thriller, Erotic thriller",Q7995657
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,German Language,Germany,Drama,Q869644


In [4]:
# Derive a numeric year from `movie_year`: take the text before the first '-'
# (so both "2001-08-24" and "1988" yield the year part) and coerce anything
# that is not a number to NaN.
cmu_year_text = cmu_df["movie_year"].astype(str).str.split('-').str[0]
cmu_df["cmu_year"] = pd.to_numeric(cmu_year_text, errors='coerce')

In [5]:
# Load the TMDB movie dataset from the data root.
tmdb_file = f'{data_path}/TMDB_movie_dataset_v12.csv'
tmdb_df = pd.read_csv(tmdb_file)

In [6]:
# Derive a numeric year from `release_date`: take the text before the first
# '-' and coerce anything that is not a number to NaN.
tmdb_year_text = tmdb_df['release_date'].astype(str).str.split('-').str[0]
tmdb_df['tmdb_year'] = pd.to_numeric(tmdb_year_text, errors='coerce')

In [7]:
# Keep TMDB rows that have a Wikidata ID, then narrow them to the IDs that
# also occur in the CMU table.
tmdb_df_out_nan = tmdb_df.dropna(subset=['wikidata_id'])
id_in_cmu = tmdb_df_out_nan['wikidata_id'].isin(cmu_df['wikidata_id'])
tmdb_only_cmu_df = tmdb_df_out_nan.loc[id_in_cmu]

In [8]:
# First pass: join CMU and TMDB rows that share a Wikidata ID.
merged_df = cmu_df.merge(tmdb_only_cmu_df, how='inner', on='wikidata_id')

In [9]:
# CMU rows whose Wikidata ID did not make it into the ID-based merge.
matched_by_id = cmu_df['wikidata_id'].isin(merged_df['wikidata_id'])
missing_cmu_df = cmu_df.loc[~matched_by_id]

In [10]:
# Number of CMU rows still unmatched after the ID-based merge.
missing_cmu_df.shape[0]

10522

In [11]:
# Second pass: match the still-unmatched CMU rows to TMDB rows with the same
# title and year. The left join keeps every unmatched CMU row, with empty
# TMDB columns where nothing matches.
merged_missing_df = missing_cmu_df.merge(
    tmdb_df,
    how='left',
    left_on=["movie_name", "cmu_year"],
    right_on=["title", "tmdb_year"],
)

In [12]:
# Discard title/year-merged rows that have no CMU year.
merged_missing_df = merged_missing_df.dropna(subset=["cmu_year"])

In [13]:
# Stack the ID-matched rows and the title/year-matched rows into one table.
# ignore_index=True rebuilds a unique 0..n-1 index. Without it each input
# keeps its own 0-based labels, so labels repeat in the result and later
# label-aligned assignments become ambiguous.
merged_df = pd.concat([merged_df, merged_missing_df], ignore_index=True)

In [14]:
# Where the CMU year is missing, fall back to the TMDB year.
filter_mask = merged_df['cmu_year'].isna()
merged_df.loc[filter_mask, 'cmu_year'] = merged_df.loc[filter_mask, 'tmdb_year']

In [15]:
# The raw date/year source columns are no longer needed once `cmu_year` is filled.
merged_df = merged_df.drop(columns=['release_date', 'movie_year', 'tmdb_year'])

In [16]:
# Where the CMU revenue is missing, fall back to the TMDB revenue.
filter_mask = merged_df['movie_revenue'].isna()
merged_df.loc[filter_mask, 'movie_revenue'] = merged_df.loc[filter_mask, 'revenue']

In [17]:
# TMDB revenue has been folded into `movie_revenue`; remove the source column.
merged_df = merged_df.drop(columns=['revenue'])

In [18]:
# Where the CMU runtime is missing, fall back to the TMDB runtime.
filter_mask = merged_df['movie_runtime'].isna()
merged_df.loc[filter_mask, 'movie_runtime'] = merged_df.loc[filter_mask, 'runtime']

In [19]:
# TMDB runtime has been folded into `movie_runtime`; remove the source column.
merged_df = merged_df.drop(columns=['runtime'])

In [20]:
# Replace 0 with a missing value in the revenue and runtime columns.
for zero_col in ('movie_revenue', 'movie_runtime'):
    is_zero = merged_df[zero_col] == 0
    merged_df.loc[is_zero, zero_col] = None

In [21]:
# Remove columns that are not carried into the output table.
unused_columns = [
    'spoken_languages',
    'production_companies',
    'movie_languages',
    'status',
    'backdrop_path',
    'homepage',
    'original_title',
    'poster_path',
    'tagline',
]
merged_df = merged_df.drop(columns=unused_columns)

In [23]:
# Where the CMU countries are missing, fall back to the TMDB production countries.
filter_mask = merged_df['movie_countries'].isna()
merged_df.loc[filter_mask, 'movie_countries'] = merged_df.loc[filter_mask, 'production_countries']

In [24]:
# Production countries have been folded into `movie_countries`; remove the source column.
merged_df = merged_df.drop(columns=['production_countries'])

In [28]:
# Where the CMU genres are missing, fall back to the TMDB genres.
filter_mask = merged_df['movie_genres'].isna()
merged_df.loc[filter_mask, 'movie_genres'] = merged_df.loc[filter_mask, 'genres']

In [29]:
# TMDB genres have been folded into `movie_genres`; remove the source column.
merged_df = merged_df.drop(columns=['genres'])

In [31]:
# The table holds the Wikidata ID in two places: rows from the ID-based merge
# have it in `wikidata_id`, while rows from the title/year merge have the
# left-hand (CMU) ID in `wikidata_id_x` and the right-hand (TMDB) ID in
# `wikidata_id_y`. Renaming `wikidata_id_x` to `wikidata_id` would create two
# columns with the same name, each half-empty. Coalesce into the existing
# `wikidata_id` column instead, then drop both suffixed columns.
merged_df['wikidata_id'] = merged_df['wikidata_id'].fillna(merged_df['wikidata_id_x'])
merged_df.drop(columns=['wikidata_id_x', 'wikidata_id_y'], inplace=True)

In [33]:
# Give the generic `id` column a name that says which dataset it comes from.
merged_df = merged_df.rename(columns={'id': 'tmdb_id'})

In [38]:
# Replace 0 with a missing value in the vote, budget and popularity columns.
for zero_col in ('vote_average', 'vote_count', 'budget', 'popularity'):
    is_zero = merged_df[zero_col] == 0
    merged_df.loc[is_zero, zero_col] = None


In [40]:
# Write the merged table to the data root; the row index is not saved.
enriched_file = f'{data_path}/enrich_movie_data.csv'
merged_df.to_csv(enriched_file, index=False)