In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from collections import Counter
from wordcloud import WordCloud
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff

# BACKEND:
- Number of movies per country (movies per genre per country "on click")
- Best Movies based on ratings per country
- Average rating per country
- Production companies

## Create the CSVs for the visualisations:

Read cleaned data:

In [2]:
DATA_DIR = "../../Data/"

In [68]:
df = pd.read_csv(DATA_DIR+"IMDb_clean.csv")

In [69]:
df

Unnamed: 0,title,year,genre,duration,country,language,director,writer,production_company,actors,avg_vote
0,Miss Jerry,1894,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",5.9
1,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",6.1
2,Cleopatra,1912,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",5.2
3,L'Inferno,1911,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",7.0
4,"From the Manger to the Cross; or, Jesus of Naz...",1912,"Biography, Drama",60,USA,English,Sidney Olcott,Gene Gauntier,Kalem Company,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...",5.7
...,...,...,...,...,...,...,...,...,...,...,...
79432,Le lion,2020,Comedy,95,"France, Belgium",French,Ludovic Colbeau-Justin,"Alexandre Coquelle, Matthieu Le Naour",Monkey Pack Films,"Dany Boon, Philippe Katerine, Anne Serra, Samu...",5.3
79433,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",103,Netherlands,"German, Dutch",Johan Nijenhuis,"Radek Bajgar, Herman Finkers",Johan Nijenhuis & Co,"Herman Finkers, Johanna ter Steege, Leonie ter...",7.7
79434,Padmavyuhathile Abhimanyu,2019,Drama,130,India,Malayalam,Vineesh Aaradya,"Vineesh Aaradya, Vineesh Aaradya",RMCC Productions,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",7.9
79435,Sokagin Çocuklari,2019,"Drama, Family",98,Turkey,Turkish,Ahmet Faik Akinci,"Ahmet Faik Akinci, Kasim Uçkan",Gizem Ajans,"Ahmet Faik Akinci, Belma Mamati, Metin Keçeci,...",6.4


In [5]:
df.dtypes

title                  object
year                    int64
genre                  object
duration                int64
country                object
language               object
director               object
writer                 object
production_company     object
actors                 object
avg_vote              float64
dtype: object

**Separate the countries so we can analyze them separately.**

In [6]:
countries = {} #create empty dictionary
list_countries = list(df['country']) #set up a list of countries
for i in list_countries:
    i = list(i.split(',')) #split countries separated by commas
    if len(i) == 1: #if 1 country in countries.keys()
        if i in list(countries.keys()): #countries.keys() is the country name, countries.values() is the count of country name
            countries[i] +=1 #count
        else:
            countries[i[0]] = 0
    else:
        for j in i: #does the same but for len(i) != 1
            if j in list(countries.keys()):
                countries[j] += 1 #count
            else:
                countries[j] =1

**Separate the genre so we can analyze them separately.**

In [7]:
genre = list(df['genre'])
genre_list = [] #create an empty list

for i in genre:
    i = list(i.split(',')) #split words when comma
    for j in i:
        genre_list.append(j.replace(' ', '')) #replace extra space
        
genre_counter = Counter(genre_list) #We'll need it after

### **Distribution of movies per year:**

In [8]:
#groupby year and count how many title we have each year
title = df.groupby('year').agg({'title': ['count']})
title.columns = ['Title Count']
title = title.sort_index(ascending=False)
title.head()

Unnamed: 0_level_0,Title Count
year,Unnamed: 1_level_1
2020,684
2019,2491
2018,2888
2017,2951
2016,2851


In [9]:
title.to_csv(DATA_DIR+"Movies_per_year.csv")

### **Distribution of movies per Country:**

In [10]:
countries_fin = {} #create an empty dictionary
for country, no in countries.items(): 
    country = country.replace(' ', '') #remove the extra space create by the split(',')
    if country in list(countries_fin.keys()):
        countries_fin[country] += no
    else:
        countries_fin[country] = no

#sort the country according to their count
countries_fin = {k: v for k, v in sorted(countries_fin.items(), key = lambda item: item[1], reverse= True)}

In [11]:
countries_df =  pd.DataFrame(list(countries_fin.items()), columns = ['Country', 'Movies count'])

### **Movies per Country:**

In [13]:
countries_movies = {}
for country, group in df.groupby("country"):
    countries = [country.replace(' ', '') for country in country.split(",")]
    films =  [film for film in group["title"]]
    for bled in countries:
        if bled in list(countries_movies.keys()):
            countries_movies[bled] += films
        else:
            countries_movies[bled] = films

In [14]:
movies_per_countryDF = pd.DataFrame({"Country":countries_movies.keys(),"Movies":countries_movies.values()}).set_index("Country")

In [15]:
movies_per_countryDF.head(10)

Unnamed: 0_level_0,Movies
Country,Unnamed: 1_level_1
Afghanistan,"[Khakestar-o-khak, Wajma, Come pietra paziente..."
France,"[Khakestar-o-khak, Wajma, Come pietra paziente..."
Germany,"[Come pietra paziente, Seven Lucky Gods, Littl..."
UK,"[Come pietra paziente, Seven Lucky Gods, Littl..."
Iran,"[Chand metre moka'ab eshgh, Raftan, Rona, Mada..."
Ireland,"[Osama, Rojo, Zama, Boda secreta, El bonaerens..."
Japan,"[Osama, Rojo, Zama, Boda secreta, El bonaerens..."
Netherlands,"[Osama, Rojo, Zama, Boda secreta, El bonaerens..."
Albania,"[Përralle Nga e Kaluara, Lulekuqet mbi mure, N..."
Austria,"[Gjallë, Juana a los 12, Parabellum, Hamaca pa..."


### Fixing country ID problem:

In [27]:
countries_ids = pd.read_csv(DATA_DIR+"new_movies_count_per_country.csv")
countries_ids = countries_ids[["Country","Movies count","country_id"]]

In [28]:
countries_df["country_id"] = countries_ids["country_id"]
country_id_dict = countries_df[["Country","country_id"]].set_index("Country").to_dict()["country_id"]

### Use country IDS instead of names:

In [29]:
movies_per_countryDF.index = movies_per_countryDF.reset_index().Country.apply(lambda x: country_id_dict[x])

In [30]:
movies_per_countryDF

Unnamed: 0_level_0,Movies
Country,Unnamed: 1_level_1
AF,"[Khakestar-o-khak, Wajma, Come pietra paziente..."
FR,"[Khakestar-o-khak, Wajma, Come pietra paziente..."
DE,"[Come pietra paziente, Seven Lucky Gods, Littl..."
UK,"[Come pietra paziente, Seven Lucky Gods, Littl..."
IR,"[Chand metre moka'ab eshgh, Raftan, Rona, Mada..."
...,...
BZ,[Inheritance]
SV,[Relentless]
SZ,[Mikeyboy]
LS,[The Forgotten Kingdom]


In [31]:
movies_per_countryDF.to_csv(DATA_DIR+"Movies_per_country.csv")

In [32]:
countries_df = countries_df.drop("Country",axis=1).rename({"country_id":"Country"},axis=1).set_index("Country")

In [33]:
countries_df

Unnamed: 0_level_0,Movies count
Country,Unnamed: 1_level_1
US,3348
FR,3061
DE,1659
UK,1575
IT,1253
...,...
OM,1
GI,0
UG,0
MU,0


In [34]:
countries_df.to_csv(DATA_DIR+"Movies_count_byCountry.csv")

### **Genres count:**

In [17]:
genres = {k: v for k, v in sorted(genre_counter.items(), key=lambda item: item[1], reverse= True)}
genres_df = pd.DataFrame({"Genre":genres.keys(),"Count":genres.values()})
genres_df.head(10)

Unnamed: 0,Genre,Count
0,Drama,43799
1,Comedy,27276
2,Romance,13124
3,Action,12011
4,Thriller,10599
5,Crime,10473
6,Horror,8788
7,Adventure,7254
8,Mystery,4902
9,Family,3690


In [18]:
genres_df.to_csv(DATA_DIR+"Genres_count.csv")

### **Movies by Genre:**

In [19]:
genres_movies = {}
for genress, group in df.groupby("genre"):
    genres = [genre.replace(' ', '') for genre in genress.split(",")]
    films =  [film for film in group["title"]]
    for g in genres:
        if g in list(genres_movies.keys()):
            genres_movies[g] += films
        else:
            genres_movies[g] = films

In [20]:
movies_genres_df = pd.DataFrame({"Genre":genres_movies.keys(),"Movies":genres_movies.values()}).set_index("Genre")
movies_genres_df.head(10)

Unnamed: 0_level_0,Movies
Genre,Unnamed: 1_level_1
Action,"[Perils of Nyoka, Satan's Harvest, Madcaps il ..."
Adventure,"[Tarzan of the Apes, Adventures of Tarzan, Tar..."
Biography,"[Il conquistatore dell'India, I fucilieri dell..."
Comedy,"[The Three Must-Get-Theres, Come vinsi la guer..."
Crime,"[I vampiri, The Galloping Ghost, Uragano expre..."
Drama,"[The Perils of Pauline, The Last of the Mohica..."
Family,"[Il figlio di Tarzan, Il libro della giungla, ..."
Fantasy,"[Il mostro del mare, Adventures of Captain Mar..."
History,"[I vichinghi, Anime sul mare, Il terrore dell'..."
Horror,"[The Brain from Planet Arous, Argos contro le ..."


In [21]:
movies_genres_df.to_csv(DATA_DIR+"Movies_byGenre.csv")

### **Production Companies:**

In [22]:
production_companies_df = df[['production_company','title']].groupby(['production_company']).count().reset_index().rename(columns={'title':'number_of_movies'})
production_companies_df = production_companies_df.sort_values(by='number_of_movies', ascending=False)
production_companies_df.to_csv(DATA_DIR+"Movies_count_byProductionCompanies.csv")

**Movies per Production Companies:**

In [23]:
company_movies = {}
for prod_comp, group in df.groupby("production_company"):
    films =  [film for film in group["title"]]
    if prod_comp in list(company_movies.keys()):
        company_movies[prod_comp] += films
    else:
        company_movies[prod_comp] = films

In [24]:
movies_per_company_DF = pd.DataFrame({"Company":company_movies.keys(),"Movies":company_movies.values()}).set_index("Company")
movies_per_company_DF["Movies_count"] = movies_per_company_DF["Movies"].apply(lambda x: len(x))

In [25]:
movies_per_company_DF = movies_per_company_DF.sort_values(by='Movies_count', ascending=False)

In [26]:
movies_per_company_DF.to_csv(DATA_DIR+"Movies_byCompany.csv")

### **Average Rating by country:**

In [70]:
df.country = df.country.apply(lambda x: [country_id_dict[c.replace(" ","")] for c in x.split(",")])

In [75]:
df

Unnamed: 0,title,year,genre,duration,country,language,director,writer,production_company,actors,avg_vote
0,Miss Jerry,1894,Romance,45,[US],,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",5.9
1,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,[AU],,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",6.1
2,Cleopatra,1912,"Drama, History",100,[US],English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",5.2
3,L'Inferno,1911,"Adventure, Drama, Fantasy",68,[IT],Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",7.0
4,"From the Manger to the Cross; or, Jesus of Naz...",1912,"Biography, Drama",60,[US],English,Sidney Olcott,Gene Gauntier,Kalem Company,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...",5.7
...,...,...,...,...,...,...,...,...,...,...,...
79432,Le lion,2020,Comedy,95,"[FR, BE]",French,Ludovic Colbeau-Justin,"Alexandre Coquelle, Matthieu Le Naour",Monkey Pack Films,"Dany Boon, Philippe Katerine, Anne Serra, Samu...",5.3
79433,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",103,[NL],"German, Dutch",Johan Nijenhuis,"Radek Bajgar, Herman Finkers",Johan Nijenhuis & Co,"Herman Finkers, Johanna ter Steege, Leonie ter...",7.7
79434,Padmavyuhathile Abhimanyu,2019,Drama,130,[IN],Malayalam,Vineesh Aaradya,"Vineesh Aaradya, Vineesh Aaradya",RMCC Productions,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",7.9
79435,Sokagin Çocuklari,2019,"Drama, Family",98,[TR],Turkish,Ahmet Faik Akinci,"Ahmet Faik Akinci, Kasim Uçkan",Gizem Ajans,"Ahmet Faik Akinci, Belma Mamati, Metin Keçeci,...",6.4


### **Best movies of each country ?**