In [2]:
import pandas as pd

# Load the raw movie dataset; the table exports in this notebook are all derived from `df`.
df = pd.read_csv('../project_files/movies.csv')

In [3]:
# Display (rows, columns) of the freshly loaded frame as a baseline for the clean-up below.
df.shape

(10866, 21)

In [4]:
# Drop rows that are exact duplicates across every column.
# This mutates `df` in place, so every later cell works on the de-duplicated frame.
df.drop_duplicates(inplace=True)
# Display the shape again so the number of removed rows is visible.
df.shape

(10865, 21)

---

### Movie Table
`movie(id, original_title)`

In [5]:
# `id` is the primary key of the movie table, so only the first row seen for
# each id is kept. The drop happens on `df` itself, which later cells also read.
df.drop_duplicates(subset='id', keep='first', inplace=True)

# Project the two columns that make up the movie table.
movies_table = df.loc[:, ['id', 'original_title']]
movies_table

Unnamed: 0,id,original_title
0,135397,Jurassic World
1,76341,Mad Max: Fury Road
2,262500,Insurgent
3,140607,Star Wars: The Force Awakens
4,168259,Furious 7
...,...,...
10861,21,The Endless Summer
10862,20379,Grand Prix
10863,39768,Beregis Avtomobilya
10864,21449,"What's Up, Tiger Lily?"


In [6]:
# Generate ../setup/movies_table.sql with one statement per row of `movies_table`:
#   INSERT INTO movie (id, original_title) VALUES (135397,'Jurassic World');
#
# Mode "w" creates the file or truncates a previous export, so no separate
# "empty the file first" step is needed. The `with` block guarantees the handle
# is flushed and closed even if a row fails part-way through. The encoding is
# pinned to UTF-8 so non-ASCII titles are written the same way on every platform.
with open("../setup/movies_table.sql", "w", encoding="utf-8") as sql_file:
    # Walk both columns in parallel instead of doing a label lookup per cell.
    for movie_id, title in zip(movies_table['id'].values, movies_table['original_title'].values):
        # Titles may contain single quotes; doubling them escapes them inside
        # a SQL string literal.
        escaped_title = str(title).replace("'", "''")
        sql_file.write(f"INSERT INTO movie (id, original_title) VALUES ({movie_id},'{escaped_title}');\n")

### IMDb Details
`imdb_details(id, imdb_id, popularity, vote_count, vote_average)`

In [7]:
# Keep one row per `imdb_id` (the first occurrence). The de-duplication key is
# `imdb_id`, not `id`; `df` itself is left untouched because `drop_duplicates`
# returns a new frame here.
# NOTE(review): pandas treats missing values as equal when de-duplicating, so if
# several rows lack an imdb_id only the first of them survives - confirm that is intended.
imdb_df = df.drop_duplicates(subset=['imdb_id'])

# Columns that make up the imdb_details table.
imdb_columns = ['id', 'imdb_id', 'popularity', 'vote_count', 'vote_average']
imdb_table = imdb_df[imdb_columns]

imdb_table

Unnamed: 0,id,imdb_id,popularity,vote_count,vote_average
0,135397,tt0369610,32.985763,5562,6.5
1,76341,tt1392190,28.419936,6185,7.1
2,262500,tt2908446,13.112507,2480,6.3
3,140607,tt2488496,11.173104,5292,7.5
4,168259,tt2820852,9.335014,2947,7.3
...,...,...,...,...,...
10861,21,tt0060371,0.080598,11,7.4
10862,20379,tt0060472,0.065543,20,5.7
10863,39768,tt0060161,0.065141,11,6.5
10864,21449,tt0061177,0.064317,22,5.4


In [8]:
# Number of distinct values in `imdb_id` on the full frame, shown as a 1-tuple shape,
# for comparison with the row count of `df`.
df.imdb_id.unique().shape

(10856,)

In [9]:
# Generate ../setup/imdb_table.sql with one statement per row of `imdb_table`:
#   INSERT INTO imdb_details (id, imdb_id, popularity, vote_count, vote_average)
#   VALUES (<id>, '<imdb_id>', <popularity>, <vote_count>, <vote_average>);
#
# Mode "w" creates the file or truncates a previous export; the `with` block
# closes the handle even if writing a row raises.
# NOTE(review): a missing imdb_id (NaN) is formatted as the literal text 'nan';
# confirm whether such rows should be skipped or stored as NULL instead.
imdb_export_columns = ['id', 'imdb_id', 'popularity', 'vote_count', 'vote_average']
with open("../setup/imdb_table.sql", "w", encoding="utf-8") as sql_file:
    # Iterate the columns in parallel rather than doing five label lookups per row.
    for movie_id, imdb_id, popularity, vote_count, vote_average in zip(
        *(imdb_table[column].values for column in imdb_export_columns)
    ):
        sql_file.write(
            "INSERT INTO imdb_details (id, imdb_id, popularity, vote_count, vote_average) "
            f"VALUES ({movie_id}, '{imdb_id}', {popularity}, {vote_count}, {vote_average});\n"
        )

### Genre Details
`movie_genre(id, genre)`

In [10]:
genre_df = df.copy(deep=True)

# Build the (id, genres) frame with `genres` turned from 'A|B|C' into ['A', 'B', 'C'].
# `.assign` returns a new frame, which avoids the SettingWithCopyWarning raised when
# writing through `.loc` into a column subset of `genre_df`.
# A missing value becomes an empty list, so a movie without genres contributes no
# rows to the export instead of a bogus genre called 'nan'.
genre = genre_df[['id', 'genres']].assign(
    genres=lambda frame: frame['genres'].apply(
        lambda names: [] if pd.isna(names) else str(names).split('|')
    )
)
genre.head(20)

Unnamed: 0,id,genres
0,135397,"[Action, Adventure, Science Fiction, Thriller]"
1,76341,"[Action, Adventure, Science Fiction, Thriller]"
2,262500,"[Adventure, Science Fiction, Thriller]"
3,140607,"[Action, Adventure, Science Fiction, Fantasy]"
4,168259,"[Action, Crime, Thriller]"
5,281957,"[Western, Drama, Adventure, Thriller]"
6,87101,"[Science Fiction, Action, Thriller, Adventure]"
7,286217,"[Drama, Adventure, Science Fiction]"
8,211672,"[Family, Animation, Adventure, Comedy]"
9,150540,"[Comedy, Animation, Family]"


In [11]:
# Generate ../setup/genre_table.sql with one statement per (movie, genre) pair:
#   INSERT INTO movie_genre (id, genre) VALUES (<id>, '<genre>');
#
# Mode "w" creates the file or truncates a previous export; the `with` block
# closes the handle even if writing a row raises.
with open("../setup/genre_table.sql", "w", encoding="utf-8") as sql_file:
    # Each entry of `genre['genres']` is a list of genre names for one movie.
    for movie_id, genre_names in zip(genre['id'].values, genre['genres'].values):
        for genre_name in genre_names:
            # Double single quotes, as the other exports do, so the value is
            # always a valid SQL string literal.
            escaped_genre = genre_name.replace("'", "''")
            sql_file.write(f"INSERT INTO movie_genre (id, genre) VALUES ({movie_id}, '{escaped_genre}');\n")

### Release Details
`release_details(id, release_date, release_year)`

In [12]:
release_df = df.copy(deep=True)


def _to_iso_date(raw_date, year):
    """Build a 'YYYY-MM-DD' string from a slash-separated raw date.

    The first '/'-separated field of ``raw_date`` is used as the month and the
    second as the day; the year is taken from ``year`` (the ``release_year``
    column) rather than from the raw string.
    """
    parts = raw_date.split('/')
    # `{value:02}` zero-pads single-digit months and days.
    return f"{year}-{int(parts[0]):02}-{int(parts[1]):02}"


# Derive the table from `release_df` in one step. Unlike rewriting the column
# row by row, this does not write into a slice of another frame and gives the
# same result when the cell is re-run.
release_details = release_df[['id', 'release_date', 'release_year']].assign(
    release_date=[
        _to_iso_date(raw_date, year)
        for raw_date, year in zip(release_df['release_date'], release_df['release_year'])
    ]
)
release_details

Unnamed: 0,id,release_date,release_year
0,135397,2015-06-09,2015
1,76341,2015-05-13,2015
2,262500,2015-03-18,2015
3,140607,2015-12-15,2015
4,168259,2015-04-01,2015
...,...,...,...
10861,21,1966-06-15,1966
10862,20379,1966-12-21,1966
10863,39768,1966-01-01,1966
10864,21449,1966-11-02,1966


In [13]:
# Generate ../setup/release_table.sql with one statement per row of `release_details`:
#   INSERT INTO release_details (id, release_date, release_year)
#   VALUES (<id>, '<release_date>', <release_year>);
#
# Mode "w" creates the file or truncates a previous export; the `with` block
# closes the handle even if writing a row raises.
with open("../setup/release_table.sql", "w", encoding="utf-8") as sql_file:
    # Iterate the three columns in parallel rather than doing a label lookup per cell.
    for movie_id, release_date, release_year in zip(
        release_details['id'].values,
        release_details['release_date'].values,
        release_details['release_year'].values,
    ):
        sql_file.write(
            "INSERT INTO release_details (id, release_date, release_year) "
            f"VALUES ({movie_id}, '{release_date}', {release_year});\n"
        )

### Metadata
`metadata(id, keywords, tagline, runtime, homepage, overview)`

In [14]:
metadata_df = df.copy(deep=True)


def _sql_escape(value):
    """Return ``str(value)`` with every single quote doubled (SQL string-literal escaping)."""
    return str(value).replace("'", "''")


def _keywords_literal(raw_keywords):
    """Turn 'a|b|c' into the quote-escaped text '{a, b, c}' (keywords is stored as a list column)."""
    return _sql_escape("{" + str(raw_keywords).replace('|', ', ') + "}")


# Build the metadata frame from `metadata_df` in a single `.assign`, so nothing is
# written into a slice of another frame and re-running the cell does not wrap the
# braces or double the quotes a second time.
# NOTE(review): missing values go through `str()` and therefore end up as the
# text 'nan' (or '{nan}' for keywords); confirm whether NULL is wanted instead.
metadata = metadata_df[['id', 'keywords', 'tagline', 'runtime', 'homepage', 'overview']].assign(
    keywords=lambda frame: frame['keywords'].apply(_keywords_literal),
    tagline=lambda frame: frame['tagline'].apply(_sql_escape),
    homepage=lambda frame: frame['homepage'].apply(_sql_escape),
    overview=lambda frame: frame['overview'].apply(_sql_escape),
)

metadata.head()

Unnamed: 0,id,keywords,tagline,runtime,homepage,overview
0,135397,"{monster, dna, tyrannosaurus rex, velociraptor...",The park is open.,124,http://www.jurassicworld.com/,Twenty-two years after the events of Jurassic ...
1,76341,"{future, chase, post-apocalyptic, dystopia, au...",What a Lovely Day.,120,http://www.madmaxmovie.com/,An apocalyptic story set in the furthest reach...
2,262500,"{based on novel, revolution, dystopia, sequel,...",One Choice Can Destroy You,119,http://www.thedivergentseries.movie/#insurgent,Beatrice Prior must confront her inner demons ...
3,140607,"{android, spaceship, jedi, space opera, 3d}",Every generation has a story.,136,http://www.starwars.com/films/star-wars-episod...,Thirty years after defeating the Galactic Empi...
4,168259,"{car race, speed, revenge, suspense, car}",Vengeance Hits Home,137,http://www.furious7.com/,Deckard Shaw seeks revenge against Dominic Tor...


In [15]:
# Generate ../setup/metadata_table.sql with one statement per row of `metadata`:
#   INSERT INTO metadata (id, keywords, tagline, runtime, homepage, overview)
#   VALUES (<id>, '<keywords>', '<tagline>', <runtime>, '<homepage>', '<overview>');
#
# The text columns are written as-is between single quotes, so they must already
# have their single quotes doubled before this cell runs.
# Mode "w" creates the file or truncates a previous export; the `with` block
# closes the handle even if writing a row raises.
metadata_export_columns = ['id', 'keywords', 'tagline', 'runtime', 'homepage', 'overview']
with open("../setup/metadata_table.sql", "w", encoding="utf-8") as sql_file:
    # Iterate the columns in parallel rather than doing six label lookups per row.
    for movie_id, keywords, tagline, runtime, homepage, overview in zip(
        *(metadata[column].values for column in metadata_export_columns)
    ):
        sql_file.write(
            "INSERT INTO metadata (id, keywords, tagline, runtime, homepage, overview) "
            f"VALUES ({movie_id}, '{keywords}', '{tagline}', {runtime}, '{homepage}', '{overview}');\n"
        )

In [16]:
# Display (rows, columns) of the metadata frame as a quick sanity check on the export size.
metadata.shape

(10865, 6)

### Finance Details
`finances(id, budget, revenue, budget_adj, revenue_adj)`

In [28]:
# The profit margin is the revenue-to-budget ratio, so a budget of 0 would cause a
# division-by-zero error. Every 0 budget is replaced with 1 to keep the ratio defined.
# This overwrites the `budget` column on the shared `df`.
df.loc[:, 'budget'] = df['budget'].mask(df['budget'] == 0, 1)

# Project the columns that make up the finances table.
finances = df.loc[:, ['id', 'budget', 'revenue', 'budget_adj', 'revenue_adj']]
finances

Unnamed: 0,id,budget,revenue,budget_adj,revenue_adj
0,135397,150000000,1513528810,1.379999e+08,1.392446e+09
1,76341,150000000,378436354,1.379999e+08,3.481613e+08
2,262500,110000000,295238201,1.012000e+08,2.716190e+08
3,140607,200000000,2068178225,1.839999e+08,1.902723e+09
4,168259,190000000,1506249360,1.747999e+08,1.385749e+09
...,...,...,...,...,...
10861,21,1,0,0.000000e+00,0.000000e+00
10862,20379,1,0,0.000000e+00,0.000000e+00
10863,39768,1,0,0.000000e+00,0.000000e+00
10864,21449,1,0,0.000000e+00,0.000000e+00


In [29]:
# Generate ../setup/finances_table.sql with one statement per row of `finances`:
#   INSERT INTO finances (id, budget, revenue, budget_adj, revenue_adj)
#   VALUES (<id>, <budget>, <revenue>, <budget_adj>, <revenue_adj>);
#
# Mode "w" creates the file or truncates a previous export; the `with` block
# closes the handle even if writing a row raises.
finance_export_columns = ['id', 'budget', 'revenue', 'budget_adj', 'revenue_adj']
with open("../setup/finances_table.sql", "w", encoding="utf-8") as sql_file:
    # Iterate the columns in parallel rather than doing five label lookups per row.
    for movie_id, budget, revenue, budget_adj, revenue_adj in zip(
        *(finances[column].values for column in finance_export_columns)
    ):
        sql_file.write(
            "INSERT INTO finances (id, budget, revenue, budget_adj, revenue_adj) "
            f"VALUES ({movie_id}, {budget}, {revenue}, {budget_adj}, {revenue_adj});\n"
        )

### Movie-Director
`directed(id, director_name)`

In [19]:
directed_df = df.copy(deep=True)

# Keep only the columns needed for the movie-director table. `.copy()` makes
# `directed` an independent frame, so assigning to its columns later does not
# raise SettingWithCopyWarning.
directed = directed_df[['id', 'director']].copy()

# Preview the raw values; several directors of one movie are separated by '|'.
# The bare expression uses the notebook's rich table display instead of print().
directed.head(20)

        id                        director
0   135397                 Colin Trevorrow
1    76341                   George Miller
2   262500                Robert Schwentke
3   140607                     J.J. Abrams
4   168259                       James Wan
5   281957  Alejandro GonzÃ¡lez IÃ±Ã¡rritu
6    87101                     Alan Taylor
7   286217                    Ridley Scott
8   211672        Kyle Balda|Pierre Coffin
9   150540                     Pete Docter
10  206647                      Sam Mendes
11   76757  Lana Wachowski|Lilly Wachowski
12  264660                    Alex Garland
13  257344                  Chris Columbus
14   99861                     Joss Whedon
15  273248               Quentin Tarantino
16  260346                 Olivier Megaton
17  102899                     Peyton Reed
18  150689                 Kenneth Branagh
19  131634                Francis Lawrence


In [20]:
# Rebuild `directed` from `directed_df` with `director` turned from 'A|B' into ['A', 'B'].
# Deriving it from the untouched copy keeps the cell re-runnable (splitting an
# already split column would stringify the lists) and avoids writing into a slice.
# A missing value becomes an empty list, so a movie without a director contributes
# no rows to the export instead of a director called 'nan'.
directed = directed_df[['id', 'director']].assign(
    director=lambda frame: frame['director'].apply(
        lambda names: [] if pd.isna(names) else str(names).split('|')
    )
)
directed.head(20)

Unnamed: 0,id,director
0,135397,[Colin Trevorrow]
1,76341,[George Miller]
2,262500,[Robert Schwentke]
3,140607,[J.J. Abrams]
4,168259,[James Wan]
5,281957,[Alejandro GonzÃ¡lez IÃ±Ã¡rritu]
6,87101,[Alan Taylor]
7,286217,[Ridley Scott]
8,211672,"[Kyle Balda, Pierre Coffin]"
9,150540,[Pete Docter]


In [21]:
# Generate ../setup/director_table.sql with one statement per (movie, director) pair:
#   INSERT INTO directed (id, director_name) VALUES (<id>, '<director_name>');
#
# Mode "w" creates the file or truncates a previous export; the `with` block
# closes the handle even if writing a row raises.
with open("../setup/director_table.sql", "w", encoding="utf-8") as sql_file:
    # Each entry of `directed['director']` is a list of director names for one movie.
    for movie_id, director_names in zip(directed['id'].values, directed['director'].values):
        for director_name in director_names:
            # Director names can contain single quotes; doubling them escapes
            # them inside a SQL string literal.
            escaped_director = director_name.replace("'", "''")
            sql_file.write(f"INSERT INTO directed (id, director_name) VALUES ({movie_id}, '{escaped_director}');\n")

### Casting Details
`movie_cast(id, actor_name)`

In [22]:
casting_df = df.copy(deep=True)

# Keep only the columns needed for the casting table. `.copy()` makes
# `movie_cast` an independent frame, so assigning to its columns later does not
# raise SettingWithCopyWarning.
movie_cast = casting_df[['id', 'cast']].copy()

# Preview the raw values; the actors of one movie are separated by '|'.
# The bare expression uses the notebook's rich table display instead of print().
movie_cast.head(20)

        id                                               cast
0   135397  Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...
1    76341  Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...
2   262500  Shailene Woodley|Theo James|Kate Winslet|Ansel...
3   140607  Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...
4   168259  Vin Diesel|Paul Walker|Jason Statham|Michelle ...
5   281957  Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...
6    87101  Arnold Schwarzenegger|Jason Clarke|Emilia Clar...
7   286217  Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...
8   211672  Sandra Bullock|Jon Hamm|Michael Keaton|Allison...
9   150540  Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...
10  206647  Daniel Craig|Christoph Waltz|LÃ©a Seydoux|Ralp...
11   76757  Mila Kunis|Channing Tatum|Sean Bean|Eddie Redm...
12  264660  Domhnall Gleeson|Alicia Vikander|Oscar Isaac|S...
13  257344  Adam Sandler|Michelle Monaghan|Peter Dinklage|...
14   99861  Robert Downey Jr.|Chris Hemsworth|Mark Ruffalo...
15  2732

In [23]:
# Rebuild `movie_cast` from `casting_df` with `cast` turned from 'A|B' into ['A', 'B'].
# Deriving it from the untouched copy keeps the cell re-runnable (splitting an
# already split column would stringify the lists) and avoids writing into a slice.
# A missing value becomes an empty list, so a movie without cast information
# contributes no rows to the export instead of an actor called 'nan'.
movie_cast = casting_df[['id', 'cast']].assign(
    cast=lambda frame: frame['cast'].apply(
        lambda names: [] if pd.isna(names) else str(names).split('|')
    )
)
movie_cast.head(20)

Unnamed: 0,id,cast
0,135397,"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan..."
1,76341,"[Tom Hardy, Charlize Theron, Hugh Keays-Byrne,..."
2,262500,"[Shailene Woodley, Theo James, Kate Winslet, A..."
3,140607,"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad..."
4,168259,"[Vin Diesel, Paul Walker, Jason Statham, Miche..."
5,281957,"[Leonardo DiCaprio, Tom Hardy, Will Poulter, D..."
6,87101,"[Arnold Schwarzenegger, Jason Clarke, Emilia C..."
7,286217,"[Matt Damon, Jessica Chastain, Kristen Wiig, J..."
8,211672,"[Sandra Bullock, Jon Hamm, Michael Keaton, All..."
9,150540,"[Amy Poehler, Phyllis Smith, Richard Kind, Bil..."


In [24]:
# Generate ../setup/actors_table.sql with one statement per (movie, actor) pair:
#   INSERT INTO movie_cast (id, actor_name) VALUES (<id>, '<actor_name>');
#
# Mode "w" creates the file or truncates a previous export; the `with` block
# closes the handle even if writing a row raises.
with open("../setup/actors_table.sql", "w", encoding="utf-8") as sql_file:
    # Each entry of `movie_cast['cast']` is a list of actor names for one movie.
    for movie_id, actor_names in zip(movie_cast['id'].values, movie_cast['cast'].values):
        for actor_name in actor_names:
            # Actor names can contain single quotes; doubling them escapes
            # them inside a SQL string literal.
            escaped_actor = actor_name.replace("'", "''")
            sql_file.write(f"INSERT INTO movie_cast (id, actor_name) VALUES ({movie_id}, '{escaped_actor}');\n")

### Production Details
`production(id, production_company)`

In [25]:
production_df = df.copy(deep=True)

# Keep only the columns needed for the production table. `.copy()` makes
# `production` an independent frame, so assigning to its columns later does not
# raise SettingWithCopyWarning.
production = production_df[['id', 'production_companies']].copy()

# Preview the raw values; the companies of one movie are separated by '|'.
# The bare expression uses the notebook's rich table display instead of print().
production.head(20)

        id                               production_companies
0   135397  Universal Studios|Amblin Entertainment|Legenda...
1    76341  Village Roadshow Pictures|Kennedy Miller Produ...
2   262500  Summit Entertainment|Mandeville Films|Red Wago...
3   140607          Lucasfilm|Truenorth Productions|Bad Robot
4   168259  Universal Pictures|Original Film|Media Rights ...
5   281957  Regency Enterprises|Appian Way|CatchPlay|Anony...
6    87101            Paramount Pictures|Skydance Productions
7   286217  Twentieth Century Fox Film Corporation|Scott F...
8   211672      Universal Pictures|Illumination Entertainment
9   150540  Walt Disney Pictures|Pixar Animation Studios|W...
10  206647                       Columbia Pictures|Danjaq|B24
11   76757  Village Roadshow Pictures|Dune Entertainment|A...
12  264660  DNA Films|Universal Pictures International (UP...
13  257344        Columbia Pictures|Happy Madison Productions
14   99861  Marvel Studios|Prime Focus|Revolution Sun Studios
15  2732

In [26]:
# Rebuild `production` from `production_df` with `production_companies` turned from
# 'A|B' into ['A', 'B']. Deriving it from the untouched copy keeps the cell
# re-runnable (splitting an already split column would stringify the lists) and
# avoids writing into a slice.
# A missing value becomes an empty list, so a movie without a known company
# contributes no rows to the export instead of a company called 'nan'.
production = production_df[['id', 'production_companies']].assign(
    production_companies=lambda frame: frame['production_companies'].apply(
        lambda names: [] if pd.isna(names) else str(names).split('|')
    )
)
production.head(20)

Unnamed: 0,id,production_companies
0,135397,"[Universal Studios, Amblin Entertainment, Lege..."
1,76341,"[Village Roadshow Pictures, Kennedy Miller Pro..."
2,262500,"[Summit Entertainment, Mandeville Films, Red W..."
3,140607,"[Lucasfilm, Truenorth Productions, Bad Robot]"
4,168259,"[Universal Pictures, Original Film, Media Righ..."
5,281957,"[Regency Enterprises, Appian Way, CatchPlay, A..."
6,87101,"[Paramount Pictures, Skydance Productions]"
7,286217,"[Twentieth Century Fox Film Corporation, Scott..."
8,211672,"[Universal Pictures, Illumination Entertainment]"
9,150540,"[Walt Disney Pictures, Pixar Animation Studios..."


In [27]:
# Generate ../setup/producers_table.sql with one statement per (movie, company) pair:
#   INSERT INTO production (id, production_company) VALUES (<id>, '<production_company>');
#
# Mode "w" creates the file or truncates a previous export; the `with` block
# closes the handle even if writing a row raises.
with open("../setup/producers_table.sql", "w", encoding="utf-8") as sql_file:
    # Each entry of `production['production_companies']` is a list of company names for one movie.
    for movie_id, company_names in zip(production['id'].values, production['production_companies'].values):
        for company_name in company_names:
            # Company names can contain single quotes; doubling them escapes
            # them inside a SQL string literal.
            escaped_company = company_name.replace("'", "''")
            sql_file.write(f"INSERT INTO production (id, production_company) VALUES ({movie_id}, '{escaped_company}');\n")