# Location/Langauge Distribution of Movie Production

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px

In [2]:
movies_data = pd.read_csv('Datasets/cleaned-movies-metainformation.csv')

In [3]:
movies_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'id',
       'original_language', 'original_title', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title',
       'vote_average', 'vote_count'],
      dtype='object')

## Location

In [4]:
movies_data_location = movies_data[['id', 'title', 'production_countries', 'production_companies']]

In [5]:
movies_data_location.head()

Unnamed: 0,id,title,production_countries,production_companies
0,862,Toy Story,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'Pixar Animation Studios', 'id': 3}]"
1,8844,Jumanji,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'TriStar Pictures', 'id': 559}, {'na..."
2,15602,Grumpier Old Men,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'Warner Bros.', 'id': 6194}, {'name'..."
3,31357,Waiting to Exhale,"[{'iso_3166_1': 'US', 'name': 'United States o...",[{'name': 'Twentieth Century Fox Film Corporat...
4,11862,Father of the Bride Part II,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'Sandollar Productions', 'id': 5842}..."


To get the information from the production_countries column, I am going to use some functions I created.

In [6]:
#makes column from those weird strings of dictonaries of lists
def makeColumn(series, key):
    count = 0
    col = []
    string = ""
    for lis in series:
        li = lis.strip('][').split(', ')
        for dic in li:
            di = dic.strip('}{').replace("'", '').split(':')
            if di[0] == key:
                string += di[1]
                string += ","
        col.append(string)
        string = ""
    return pd.Series(col)

In [7]:
#function that counts amount of each item in 'ser'
def count_amount(ser, ind):
    li = []
    for i in ind:
        total = 0
        for j in ser:
            if i in j:
                total += 1
        li.append(total)
    return li

In [8]:
def convert(string): 
    li = list(string.split(","))
    li = [x.strip(' ') for x in li]
    return li

In [9]:
movies_data_location['Production Country'] = makeColumn(movies_data_location['production_countries'], 'name')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_data_location['Production Country'] = makeColumn(movies_data_location['production_countries'], 'name')


In [10]:
movies_data_location

Unnamed: 0,id,title,production_countries,production_companies,Production Country
0,862,Toy Story,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'Pixar Animation Studios', 'id': 3}]","United States of America,"
1,8844,Jumanji,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'TriStar Pictures', 'id': 559}, {'na...","United States of America,"
2,15602,Grumpier Old Men,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","United States of America,"
3,31357,Waiting to Exhale,"[{'iso_3166_1': 'US', 'name': 'United States o...",[{'name': 'Twentieth Century Fox Film Corporat...,"United States of America,"
4,11862,Father of the Bride Part II,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'Sandollar Productions', 'id': 5842}...","United States of America,"
...,...,...,...,...,...
43981,439050,Subdue,"[{'iso_3166_1': 'IR', 'name': 'Iran'}]",[],"Iran,"
43982,111109,Century of Birthing,"[{'iso_3166_1': 'PH', 'name': 'Philippines'}]","[{'name': 'Sine Olivia', 'id': 19653}]","Philippines,"
43983,67758,Betrayal,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'name': 'American World Pictures', 'id': 6165}]","United States of America,"
43984,227506,Satan Triumphant,"[{'iso_3166_1': 'RU', 'name': 'Russia'}]","[{'name': 'Yermoliev', 'id': 88753}]","Russia,"


In [13]:
movies_data_location['Production Country']

0         United States of America,
1         United States of America,
2         United States of America,
3         United States of America,
4         United States of America,
                    ...            
43981                         Iran,
43982                  Philippines,
43983     United States of America,
43984                       Russia,
43985               United Kingdom,
Name: Production Country, Length: 43986, dtype: object

In [19]:
movies_data_location[movies_data_location['title'] == "Toy Story"]['id'].sum() / movies_data_location[movies_data_location['title'] == "Toy Story"]['id'].count()

862.0

In [11]:
con_list = []
for i in movies_data_location['Production Country']:
    con_list = con_list + convert(i)
movie_countries = np.unique(con_list)

In [12]:
movie_countries = movie_countries.tolist()[1:]
movie_countries

['"Cote DIvoire"',
 '"Lao Peoples Democratic Republic"',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Antarctica',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cayman Islands',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Czechoslovakia',
 'Denmark',
 'Dominican Republic',
 'East Germany',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Finland',
 'France',
 'French Polynesia',
 'French Southern Territories',
 'Georgia',
 'Germany',
 'Ghana',
 'Gibraltar',
 'Greece',
 'Guatemala',
 'Guinea',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Ja

In [13]:
country_counts = count_amount(movies_data_location['Production Country'], movie_countries)
country_counts

[2,
 3,
 8,
 5,
 13,
 3,
 1,
 252,
 5,
 6,
 557,
 197,
 2,
 4,
 2,
 1,
 9,
 443,
 1,
 4,
 10,
 34,
 3,
 286,
 1,
 56,
 10,
 9,
 5,
 1705,
 1,
 4,
 67,
 371,
 34,
 4,
 6,
 77,
 24,
 7,
 198,
 4,
 374,
 8,
 6,
 10,
 25,
 3,
 57,
 5,
 377,
 3787,
 1,
 1,
 30,
 2224,
 4,
 1,
 167,
 6,
 3,
 1,
 595,
 169,
 65,
 825,
 33,
 101,
 8,
 221,
 146,
 2167,
 4,
 1607,
 9,
 23,
 2,
 1,
 6,
 27,
 15,
 2,
 3,
 8,
 36,
 89,
 2,
 15,
 1,
 13,
 2,
 9,
 1,
 5,
 326,
 1,
 4,
 3,
 10,
 33,
 1,
 3,
 6,
 362,
 1,
 123,
 3,
 6,
 3,
 200,
 18,
 11,
 7,
 2,
 3,
 30,
 83,
 305,
 119,
 13,
 14,
 141,
 857,
 4,
 1,
 2,
 14,
 109,
 3,
 41,
 29,
 32,
 1,
 124,
 494,
 15,
 947,
 3,
 582,
 249,
 5,
 129,
 3,
 2,
 115,
 2,
 18,
 165,
 2,
 55,
 27,
 4004,
 1,
 20587,
 18,
 5,
 19,
 10,
 4,
 3]

In [14]:
movie_country_count = pd.DataFrame()
movie_country_count['country'] = movie_countries
movie_country_count['count'] = country_counts
movie_country_count
movie_country_count.iloc[153, movie_country_count.columns.get_loc('country')] ='United States'

In [15]:
#get iso_alpha values
df = px.data.gapminder().query("year==2007")
df = df[['country', 'iso_alpha', 'continent']]

#merge data on country
movie_country_count = movie_country_count.merge(df, on="country")

In [19]:
movie_country_count['proportion'] = (movie_country_count['count'] / movie_country_count['count'].sum()) * 100
movie_country_count

Unnamed: 0,country,count,iso_alpha,continent,proportion
0,Afghanistan,8,AFG,Asia,0.017488
1,Albania,5,ALB,Europe,0.010930
2,Algeria,13,DZA,Africa,0.028418
3,Angola,3,AGO,Africa,0.006558
4,Argentina,252,ARG,Americas,0.550868
...,...,...,...,...,...
101,United States,20587,USA,Americas,45.002842
102,Uruguay,18,URY,Americas,0.039348
103,Venezuela,19,VEN,Americas,0.041534
104,Vietnam,10,VNM,Asia,0.021860


In [30]:
from plotly.offline import init_notebook_mode, iplot

init_notebook_mode(connected=True)  
fig = px.scatter_geo(movie_country_count, locations="iso_alpha", hover_name="country", size="count",
                     color="continent", hover_data={'continent':False, 'iso_alpha':False, 'proportion': True},
                     projection="natural earth")

fig.update_layout(
        hoverlabel_align = 'auto',
        title = 'Geographic Distribution of Movie Data'
    )

fig.show()

## Language

In [64]:
us_data = movies_data[movies_data['production_countries'] == "[{'iso_3166_1': 'US', 'name': 'United States of America'}]"]

In [65]:
language_data = us_data.groupby('original_language').count().reset_index()[['original_language', 'id']]
language_data.describe()
language_data.rename(columns={"id": "count"}, inplace=True)

In [66]:
language_data

Unnamed: 0,original_language,count
0,ab,3
1,cs,1
2,da,2
3,de,32
4,en,17126
5,eo,1
6,es,21
7,fr,72
8,he,1
9,it,15


Using the [ISO Language Codes](https://datahub.io/core/language-codes) to convert the codes into the English name for the languages.

In [67]:
language_codes = pd.read_csv('Datasets/language-codes_csv.csv')

In [68]:
language_codes['original_language'] = language_codes['alpha2']
language_codes = language_codes[['original_language', 'English']]
language_codes

Unnamed: 0,original_language,English
0,aa,Afar
1,ab,Abkhazian
2,ae,Avestan
3,af,Afrikaans
4,ak,Akan
...,...,...
179,yi,Yiddish
180,yo,Yoruba
181,za,Zhuang; Chuang
182,zh,Chinese


In [69]:
language_data = language_data.merge(language_codes, on="original_language")
language_data.loc[language_data['count'] < 10, 'English'] = 'Other Language'
language_data

Unnamed: 0,original_language,count,English
0,ab,3,Other Language
1,cs,1,Other Language
2,da,2,Other Language
3,de,32,German
4,en,17126,English
5,eo,1,Other Language
6,es,21,Spanish; Castilian
7,fr,72,French
8,he,1,Other Language
9,it,15,Italian


In [70]:
fig = px.pie(language_data, values='count', names='English', title='Movie Languages in the US')
fig.show()

In [59]:
world_data = movies_data[movies_data['production_countries'] != "[{'iso_3166_1': 'US', 'name': 'United States of America'}]"]
language_data = world_data.groupby('original_language').count().reset_index()[['original_language', 'id']]
language_data.describe()
language_data.rename(columns={"id": "count"}, inplace=True)

In [60]:
language_data = language_data.merge(language_codes, on="original_language")
language_data.loc[language_data['count'] < 400, 'English'] = 'Other Language'
language_data

Unnamed: 0,original_language,count,English
0,ab,7,Other Language
1,af,2,Other Language
2,am,2,Other Language
3,ar,39,Other Language
4,ay,1,Other Language
...,...,...,...
80,uz,1,Other Language
81,vi,10,Other Language
82,wo,5,Other Language
83,zh,406,Chinese


In [62]:
fig = px.pie(language_data, values='count', names='English', title='Movie Languages (Excluding the US)')
fig.show()