## Preprocessing and Dataset Statistics

In [None]:
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn import cluster
import matplotlib.pyplot as plt
import seaborn as sb
import matplotlib as g
from sklearn.preprocessing import OneHotEncoder
import random
import scipy
import scipy.signal

**Importing the Dataset**

In [None]:
# Load the previously cleaned movie table from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
movie = pd.read_pickle('cleaned_movie_set')
# Preview the first rows.
movie.head()

**Acquiring Genre Correlations**

In [None]:
# Rank column pairs by the absolute value of their pairwise correlation.
df = movie
# Correlate numeric/boolean columns only. Older pandas silently ignored
# non-numeric columns (presumably the Title column here) in corr(), while
# pandas >= 2.0 raises on them, so the selection is made explicit.
c = df.select_dtypes(include=['number', 'bool']).corr().abs()
s = c.unstack()  # one entry per ordered (column, column) pair
so = s.sort_values(ascending=False, kind="quicksort")
# Skip the first 28 entries, then keep every second one.
# NOTE(review): presumably 28 is the number of self-correlations (all 1.0) and
# the step of 2 removes the mirrored (a, b) / (b, a) duplicates -- confirm
# against the actual number of correlated columns.
genre_corr = so[28::2]
genre_corr[0:28]

**Acquiring Cumulative Sums and Label Counts**

In [None]:
# Per-genre label counts and each genre's share of all labels.
df = movie
genres = df.columns[2::]  # NOTE(review): assumes the first two columns are not genres -- confirm
# One {value: number_of_rows} dict per genre column.
counts = [df[genre].value_counts().to_dict() for genre in genres]

# Creating the counts table: one row per genre, one column per observed value.
# NOTE(review): assumes every genre column holds only the values 0 and 1.
counts_df = pd.DataFrame(counts)
counts_df[2] = genres
# Keep only the number of rows equal to 1 ('counts') and the genre name.
counts_table = counts_df.drop([0], axis=1).rename(columns={1: 'counts', 2: 'genre'})
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
counts_sort = counts_table.sort_values('counts', ascending=False)

# Despite its name, 'cumsum' is each genre's share of the summed counts
# (counts / total), not a running total.
sumcol = counts_sort['counts'].sum()
cumsum = counts_sort['counts'] / sumcol
counts_sort['cumsum'] = cumsum
counts_sort

**Displaying Correlation Matrix**

In [None]:
# Label lists for the correlation heatmap.
# (An earlier, commented-out draft of this cell also dropped the leading
# column(s) from the frame; that dead code has been removed.)
df = movie
# Every column name except the first, in frame order ...
genres = list(df.columns[1:])
# ... and the same names in reverse order.
genres_rev = list(reversed(genres))

In [None]:
# Heatmap of the pairwise correlations between the numeric columns of df.
# np.corrcoef cannot digest non-numeric columns, so restrict the frame first.
numeric_df = df.select_dtypes(include=['number', 'bool'])
R = np.corrcoef(numeric_df, rowvar=0)  # rowvar=0: each column is a variable

# Give seaborn a labelled frame so the tick labels always follow the data
# order, rather than assigning manually reversed label lists after plotting.
R_labelled = pd.DataFrame(R, index=numeric_df.columns, columns=numeric_df.columns)
genre_heatmap = sb.heatmap(R_labelled, xticklabels=True, yticklabels=True)
plt.setp(genre_heatmap.get_xticklabels(), rotation=90)
plt.setp(genre_heatmap.get_yticklabels(), rotation=0)
# plt.show() draws the current figure; it does not take an Axes argument.
plt.show()

**Removing Irrelevant Label Genres**

In [None]:
# Genres whose share of all labels (the 'cumsum' column) is below 4%.
# A boolean mask picks exactly those rows. Series.where() followed by a fixed
# positional slice is only correct when a known number of genres sits at or
# above the cutoff.
garb = counts_sort.loc[counts_sort['cumsum'] < .04, 'genre'].tolist()
garb

In [None]:
# Clean dataset without the rare ("garbage") genre columns listed in `garb`.
# Start from `movie` rather than the `df` alias: `df` is rebound to a filtered
# frame further down the notebook, so re-running this cell after that point
# would otherwise drop columns from the wrong table.
clean_df = movie.drop(garb, axis=1)
clean_df.head()

**New Correlations after Removing Excess Labels**

In [None]:
# New correlations after removing excess labels.
# Correlate numeric/boolean columns only: pandas >= 2.0 raises on non-numeric
# columns (presumably Title) instead of silently ignoring them.
c = clean_df.select_dtypes(include=['number', 'bool']).corr().abs()
s = c.unstack()  # one entry per ordered (column, column) pair
so = s.sort_values(ascending=False, kind="quicksort")
# Skip the first 9 entries, then keep every second one.
# NOTE(review): presumably 9 self-correlations followed by mirrored duplicate
# pairs -- confirm against the number of columns left in clean_df.
genre_corr = so[9::2]
genre_corr[0:12]

**Dropping Non-Contextual Genres**

In [None]:
# Remove three more genre columns and append a constant 'sample' column of zeros.
sixgenres = (
    clean_df
    .drop(['Adventure', 'Documentary', 'Drama'], axis=1)
    .assign(sample=0)
)
sixgenres.head()

**Checking Cumulative Sums after Removing Non-contextual Genres**

In [None]:
# Re-count labels for the genres left in `sixgenres`.
# Skip the first column and the helper columns this notebook adds to the frame
# ('sample', 'Empty'): they are not genre labels and would add junk rows to the
# table.
genres = [col for col in sixgenres.columns[1:] if col not in ('sample', 'Empty')]
# One {value: number_of_rows} dict per genre column.
counts = [sixgenres[genre].value_counts().to_dict() for genre in genres]

# One row per genre, one column per observed value.
# NOTE(review): assumes every genre column holds only the values 0 and 1.
counts_df = pd.DataFrame(counts)
counts_df[2] = genres
counts_table = counts_df.drop([0], axis=1).rename(columns={1: 'counts', 2: 'genre'})
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
counts_sort = counts_table.sort_values('counts', ascending=False)

# Despite its name, 'cumsum' is each genre's share of the summed counts,
# not a running total.
sumcol = counts_sort['counts'].sum()
cumsum = counts_sort['counts'] / sumcol
counts_sort['cumsum'] = cumsum
counts_sort

**Downsampling Comedy to Balance Label Classes**

In [None]:
# Genre columns that are summed per row below.
col_list = ['Action','Comedy','Crime','Horror','Romance','Thriller']
# 'Empty' holds the row-wise sum over those columns; it is written onto
# `sixgenres` itself, so that frame gains the column as well.
sixgenres['Empty'] = sixgenres.loc[:, col_list].sum(axis=1)
# Keep only the rows whose sum is non-zero.
df = sixgenres.loc[sixgenres['Empty'] != 0]
df.head()

In [None]:
# Down-sample the comedy rows to 4000 sampled titles, reproducibly.
comedy = df[df['Comedy'] == 1]
# Seed NumPy's global generator by *calling* seed(). Assigning a value to
# np.random.seed replaces the function and leaves the generator unseeded; if
# that happened earlier in this kernel, restart it before running this cell.
np.random.seed(1)
# Draw 4000 entries of the Title column without replacement
# (raises ValueError when fewer than 4000 comedy rows exist).
b = np.random.choice(comedy.Title, size=4000, replace=False)
# Comedy rows whose title was drawn.
df2 = comedy[comedy['Title'].isin(b)]

In [None]:
# Keep every non-comedy row, plus only those comedy rows whose title was
# sampled into df2.
# Titles are tested on `df` itself so the mask shares df's index, and the
# comparison is parenthesised because `&` binds tighter than `==`.
df3 = df[(df['Title'].isin(df2.Title) & (df['Comedy'] == 1)) | (df['Comedy'] == 0)]

In [None]:
# Keep every non-comedy row, plus only those comedy rows whose title was
# sampled into df2.
# Titles are tested on `df` itself so the mask shares df's index, and the
# comparison is parenthesised because `&` binds tighter than `==`.
df3 = df[(df['Title'].isin(df2.Title) & (df['Comedy'] == 1)) | (df['Comedy'] == 0)]
# Drop the two helper columns by name rather than by position, so the result
# does not depend on column order.
df4 = df3.drop(['sample', 'Empty'], axis=1)
df4

**Double-Checking Class Balance**

In [None]:
# Label counts per genre after down-sampling, to check the class balance.
genres = df4.columns[1:].tolist()  # every column except the first
# One {value: number_of_rows} dict per genre column.
counts = [df4[genre].value_counts().to_dict() for genre in genres]

# One row per genre, one column per observed value.
# NOTE(review): assumes every genre column holds only the values 0 and 1.
counts_df = pd.DataFrame(counts)
counts_df[2] = genres
counts_table = counts_df.drop([0], axis=1).rename(columns={1: 'counts', 2: 'genre'})
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
counts_sort = counts_table.sort_values('counts', ascending=False)

# Despite its name, 'cumsum' is each genre's share of the summed counts,
# not a running total.
sumcol = counts_sort['counts'].sum()
cumsum = counts_sort['counts'] / sumcol
counts_sort['cumsum'] = cumsum
counts_sort