In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats as scs
from importlib import reload
from sklearn.model_selection import train_test_split
import warnings

# suppress all warnings (note: this also hides pandas parser and deprecation warnings)
warnings.filterwarnings('ignore')
# render matplotlib figures inline in the notebook, at retina resolution
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Load the movie catalogue: one record per line, fields separated by "::".
# pandas only supports a multi-character separator in its Python parsing
# engine; naming the engine explicitly gives the same parse without relying
# on the implicit fallback (and the ParserWarning that comes with it).
# NOTE(review): if this raises UnicodeDecodeError the file is probably not
# UTF-8 -- confirm its encoding and pass e.g. encoding='latin-1'.
movies = pd.read_table('data/movies.dat', sep='::', header=None, engine='python')
movies.columns = ['movie_id', 'movie_title', 'genre']

# Split each raw "Title (YYYY)" string into a clean title and a release year.
#
# Only the final parenthesised four-digit group is treated as the year, so a
# title that contains parentheses of its own keeps them, e.g.
#     "Postino, Il (The Postman) (1994)"
#         -> title "Postino, Il (The Postman)", year "1994"
# (splitting on every '(' would keep only the text before the first one).
# Surrounding whitespace is stripped from the title, so "Toy Story (1995)"
# yields "Toy Story" rather than "Toy Story ". A string without a trailing
# "(YYYY)" keeps its full text as the title and gets a missing year instead
# of raising.

# Per-row [movie_id, genre] lists, kept under the original name for any later
# code that reads the genre string from position 1 of each row.
temp = movies[['movie_id', 'genre']].values.tolist()

# The untouched "Title (YYYY)" strings, one per movie.
titles_w_year = movies['movie_title'].tolist()

# Group 1: title text (lazy, so it stops at the last "(YYYY)").
# Group 2: the four-digit year, still as a string.
title_year = movies['movie_title'].str.extract(
    r'^(.*?)\s*\((\d{4})\)\s*$', expand=True)
title_year.columns = ['title', 'year']

# Rows the pattern did not match fall back to the raw string as their title.
titles = title_year['title'].fillna(movies['movie_title']).str.strip().tolist()
years = title_year['year'].tolist()

# Overwrite the combined column with the clean title and add the year column.
movies['movie_title'] = titles
movies['year'] = years

# Raw genre string of every movie, e.g. "Animation|Children's|Comedy".
# Read straight from the dataframe column instead of popping a position out
# of the per-row lists, so this step does not depend on what earlier code has
# already removed from those rows.
genre = movies['genre'].tolist()

# Split each string at '|' into one list of genre names per movie,
#     e.g. ['Animation', "Children's", 'Comedy']
lists_of_genres = [genre_cluster.split('|') for genre_cluster in genre]

# Distinct genre names, in order of first appearance. The loop variables are
# named so that they do not overwrite the `genre` list above.
unique_genres = []
for genres_of_movie in lists_of_genres:
    for genre_name in genres_of_movie:
        if genre_name not in unique_genres:
            unique_genres.append(genre_name)

# Result recorded for this dataset (18 genres):
# unique_genres = ['Animation', "Children's", 'Comedy', 'Adventure',
#  'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
#  'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir', 'Western']

# Now that the genres are isolated, add one indicator ("dummy") column per
# genre: the column holds 1 if the movie is in that genre and 0 if it is not.

# Genres this notebook has always emitted, in their established column order.
# Keeping the list explicit means these 18 columns are always produced, in
# this order, even if one of them happens to be absent from the data.
known_genres = ['Animation', "Children's", 'Comedy', 'Adventure',
 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
 'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir', 'Western']

# Any genre found in the data but missing from the list above gets its own
# column after the known ones instead of being silently dropped.
unique_genres = known_genres + [
    genre_name for genre_name in unique_genres if genre_name not in known_genres
]

# One list of 0/1 values per genre, aligned row-for-row with `movies`.
genre_dummy_lists = tuple(
    [1 if genre_name in genres_of_movie else 0 for genres_of_movie in lists_of_genres]
    for genre_name in unique_genres
)

# Add a column for each dummy list. Names and lists are paired with zip rather
# than a hand-maintained counter, so they cannot drift out of step.
for genre_name, dummy_list in zip(unique_genres, genre_dummy_lists):
    movies[genre_name] = dummy_list

# drop the old 'genre' column
movies = movies.drop(labels=['genre'], axis=1)


# Disabled snippet kept as a bare string literal: none of it executes, the
# string is only evaluated (and echoed as the cell's output).
# NOTE(review): the header says "most common genres", but the body only
# repeats the column-assignment loop, and it indexes unique_genres with `_`
# while incrementing `__` -- it would need fixing before being re-enabled.
'''# let's see which genres are most common
__ = 0
for dummy_list in genre_dummy_lists:
    title = unique_genres[_] 
    __ += 1
    movies[title] = dummy_list  '''


"# let's see which genres are most common\n__ = 0\nfor dummy_list in genre_dummy_lists:\n    title = unique_genres[_] \n    __ += 1\n    movies[title] = dummy_list  "

In [3]:
# Persist the one-hot encoded movie table. The row index is written too
# (to_csv's default), so the file starts with an unnamed index column.
movies.to_csv(path_or_buf='data/one_hot_data.csv')