## Beginning by importing packages with standard aliases

In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

## Importing the data as a dataframe
Indexing the first column as non-relavent

In [4]:
df = pd.read_csv('./zippedData/tmdb.movies.csv.gz', index_col = 0)


Getting a general idea of what the dataset looks like

In [5]:
df

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...
26512,"[27, 18]",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,"[14, 28, 12]",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [9]:
df.isna().sum()

genre_ids            0
id                   0
original_language    0
original_title       0
popularity           0
release_date         0
title                0
vote_average         0
vote_count           0
dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26517 entries, 0 to 26516
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genre_ids          26517 non-null  object 
 1   id                 26517 non-null  int64  
 2   original_language  26517 non-null  object 
 3   original_title     26517 non-null  object 
 4   popularity         26517 non-null  float64
 5   release_date       26517 non-null  object 
 6   title              26517 non-null  object 
 7   vote_average       26517 non-null  float64
 8   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


#### Key Takeaways:
- No null values
- Need to find a movie genre key to figure out what the genres mean
- Maybe need to limit the data to the most recent 5 years to stay relavent
    - Can maybe also plot popularity vs time to see trends that way?

## Found a movie genre key
Importing it below. (found at: https://www.themoviedb.org/talk/5daf6eb0ae36680011d7e6ee)

In [170]:
backwards_key = {
'Action' : '28',
'Adventure' : '12',
'Animation' : '16',
'Comedy' : '35',
'Crime' : '80',
'Documentary' : '99',
'Drama' : '18',
'Family' : '10751',
'Fantasy' : '14',
'History' : '36',
'Horror' : '27',
'Music' : '10402',
'Mystery' : '9648',
'Romance' : '10749',
'Science Fiction' : '878',
'TV Movie' : '10770',
'Thriller' : '53',
'War' : '10752',
'Western' : '37'
}

genre_key = {v: k for k, v in backwards_key.items()}
print(genre_key)

{'28': 'Action', '12': 'Adventure', '16': 'Animation', '35': 'Comedy', '80': 'Crime', '99': 'Documentary', '18': 'Drama', '10751': 'Family', '14': 'Fantasy', '36': 'History', '27': 'Horror', '10402': 'Music', '9648': 'Mystery', '10749': 'Romance', '878': 'Science Fiction', '10770': 'TV Movie', '53': 'Thriller', '10752': 'War', '37': 'Western'}


Checking the type of data for genre ID below. Goal is to create new columns with primary, secondary, tertiary, etc genres

In [89]:
print(df['genre_ids'][0])

[12, 14, 10751]


Realized that the genre_ids column is a string that just looks like a list,
need to do some data manipulation to clean it up

Making a list for char removal

In [99]:
char_remove = ["'", " ", "[", "]"]

In [110]:
for char in char_remove:
    current_row = df['genre_ids'][0]
    current_row = current_row.replace(char, '')
    current_row = current_row.split(',')
    
        

print(current_row)

['12', '14', '10751']


## created a for-loop that runs through each row of the column df['genre_id's]
the for-loop converts the string to a list of int\
then it takes each int and assigns it to a new list depending on where it is in the genre_id's list\
if there is no value for that iteration, it populates the list with Nonetype\
I include nonetype so that they stay on point with the correct index\

In [160]:
char_remove = ["'", " ", "[", "]"]

first_genre = []
second_genre = []
third_genre = []
fourth_genre = []
fifth_genre = []
sixth_genre = []
seventh_genre = []

for x in df['genre_ids']:
    row = x
    for char in char_remove:
        row = row.replace(char, '')
    row = row.split(',')
    first_genre.append(row[0])
    
    if len(row) == 1:
        second_genre.append(None)
        third_genre.append(None)
        fourth_genre.append(None)
        fifth_genre.append(None)
        sixth_genre.append(None)
        seventh_genre.append(None)
    if len(row) == 2:
        second_genre.append(row[1])
        third_genre.append(None)
        fourth_genre.append(None)
        fifth_genre.append(None)
        sixth_genre.append(None)
        seventh_genre.append(None)
    if len(row) == 3:
        second_genre.append(row[1])
        third_genre.append(row[2])
        fourth_genre.append(None)
        fifth_genre.append(None)
        sixth_genre.append(None)
        seventh_genre.append(None)
    if len(row) == 4:
        second_genre.append(row[1])
        third_genre.append(row[2])
        fourth_genre.append(row[3])
        fifth_genre.append(None)
        sixth_genre.append(None)
        seventh_genre.append(None)
    if len(row) == 5:
        second_genre.append(row[1])
        third_genre.append(row[2])
        fourth_genre.append(row[3])
        fifth_genre.append(row[4])
        sixth_genre.append(None)
        seventh_genre.append(None)
    if len(row) == 6:
        second_genre.append(row[1])
        third_genre.append(row[2])
        fourth_genre.append(row[3])
        fifth_genre.append(row[4])
        sixth_genre.append(row[5])
        seventh_genre.append(None)
    if len(row) == 7:
        second_genre.append(row[1])
        third_genre.append(row[2])
        fourth_genre.append(row[3])
        fifth_genre.append(row[4])
        sixth_genre.append(row[5])
        seventh_genre.append(row[6])
        
    if len(row) == 8:
        print('wowie')

In [159]:
len(second_genre)

26517

In [161]:
len(seventh_genre)

26517

## Now I am appending each new list as a column in the dataframe
Starting here with the first genre

In [157]:
seventh_genre

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [162]:
df['First_genre'] = first_genre

In [163]:
df.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,First_genre
0,121410751,12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,12
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,14
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,12
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,16
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,28


## using my dictionary for genres to rename the column appropriately

In [173]:
df["First_genre"].replace(genre_key, inplace=True)

In [174]:
df.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,First_genre
0,121410751,12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,Adventure
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,Fantasy
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,Adventure
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,Animation
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,Action
