In [56]:
from datetime import datetime, date, time
import os 
import tarfile
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from matplotlib import pyplot as plt

In [57]:
#P172: wikidata ID for ethnicity

WIKIDATA_QUERY = """
SELECT DISTINCT ?item ?itemLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
  {
    SELECT DISTINCT ?item WHERE {
      ?item p:P646 ?statement0.
      ?statement0 (ps:P646) "%s".
    }
    LIMIT 100
  }
}

"""

In [3]:
r = requests.get('https://www.wikidata.org/w/index.php?search=&search=&title=Special%3ASearch&go=Go/html') 
soup = BeautifulSoup(r.text, 'html.parser')
soup.h2

<h2>Navigation menu</h2>

In [4]:
from wikidata.client import Client

client = Client()  # doctest: +SKIP
entity = client.get('Q49085', load=True)
entity.label

m'African Americans'

In [6]:
#extract tar data
data_folder_tar = './data/'
data_path_tar = os.path.join(data_folder_tar, 'MovieSummaries.tar.gz')

In [7]:
# open file
file = tarfile.open(data_path_tar)
  
# extract files
file.extractall('./data')
  
# close file
file.close()

In [8]:
Name_movie = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres']

In [9]:
#read different datas as dataframes
data_folder = './data/'
data_path = os.path.join(data_folder,'movie.metadata.tsv')
df_movie = pd.read_csv(data_path, sep ='\t', header = None, names = Name_movie, na_values = ['{}', ' '], lineterminator='\n')


In [11]:
df_movie.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [10]:
#Filtering
def filtering_function(column):
    _filtered = []
    i = 0
    for s in column:
        S_r = ''
        c = 1
        if (isinstance(s, str)):
            if s.find(',') == -1:
                S_r = s[(s.find(':')+3):(s.find(':}')-1)]
            else:
                a = s
                S_r = a[(a.find(':')+3):(a.find(',')-1)]
                a = a[(a.find(',')+1):]
                while c>0:
                    if a.find(',') == -1:
                        S_r =  S_r + ', ' + a[(a.find(':')+3):(a.find(':}')-1)]
                        c = 0
                    else:
                        S_r = S_r + ', ' + a[(a.find(':')+3):(a.find(',')-1)]
                        a = a[(a.find(',')+1):]
        else:
            S_r = 'None' 
        _filtered.append(S_r)
    return _filtered

#Language Filtering
language_filtered = filtering_function(df_movie['Movie languages'])
df_movie['Movie languages filtered'] = language_filtered

#Countries Filtering
countries_filtered = filtering_function(df_movie['Movie countries'])
df_movie['Movie countries filtered'] = countries_filtered

#Genres Filtering
genres_filtered = filtering_function(df_movie['Movie genres'])
df_movie['Movie genres filtered'] = genres_filtered

In [11]:
#movie release date to just year

def date(df_date):
    dates = []
    len_year = 4
    for date in df_date:
        if not (pd.isna(date)):
            date = str(date)
            if len(date) > len_year:
                dates.append(date[0:4])
            else:
                dates.append(date)
        else:
            dates.append(date)
            
    return dates


In [12]:
df_movie["dates_filtered"] = date(df_movie["Movie release date"])

In [13]:
df_movie

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Movie languages filtered,Movie countries filtered,Movie genres filtered,dates_filtered
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",English Language,United States of America,"Thriller, Science Fiction, Horror, Adventure, ...",2001
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",English Language,United States of America,"Mystery, Biographical film, Drama, Crime Drama",2000
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",Norwegian Language,Norway,"Crime Fiction, Drama",1988
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",English Language,United Kingdom,"Thriller, Erotic thriller, Psychological thriller",1987
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",German Language,Germany,Drama,1983
...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",English Language,United States of America,Drama,2011
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",English Language,"Ireland, United Kingdom","Biographical film, Drama, Documentary",2011
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}",English Language,United States of America,"Satire, Comedy",1972
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...",Japanese Language,Japan,"Science Fiction, Japanese Movies, Adventure, A...",1992


In [14]:
Name_character = ['Wikipedia Movie ID','Freebase Movie ID', 'Movie release date', 'Character Name','Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity', 'Actor Name', 'Actor age at movie release', 'unknown 1', 'unknown 2', 'Freebase character map']

In [15]:
data_path = os.path.join(data_folder,'character.metadata.tsv')
df_character = pd.read_csv(data_path, sep ='\t', header = None, names = Name_character, na_values = ' ', lineterminator='\n')

In [18]:
df_character.head(10)

Unnamed: 0,Wikipedia Movie ID,Freebase Movie ID,Movie release date,Character Name,Actor DOB,Actor gender,Actor height,Actor ethnicity,Actor Name,Actor age at movie release,unknown 1,unknown 2,Freebase character map
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
5,975900,/m/03vyhn,2001-08-24,Commander Helena Braddock,1949-05-26,F,1.727,/m/0x67,Pam Grier,52.0,/m/02vdcfp,/m/0bgchnd,/m/0418ft
6,975900,/m/03vyhn,2001-08-24,Whitlock,1945-08-02,F,1.753,,Joanna Cassidy,56.0,/m/02vd6kw,/m/0bgchmx,/m/06lj1m
7,975900,/m/03vyhn,2001-08-24,Big Daddy Mars,,M,,,Richard Cetrone,,/m/0bgchsy,/m/0bgcht0,/m/0bgcht7
8,975900,/m/03vyhn,2001-08-24,Michael Descanso,1971-03-20,M,1.892,,Liam Waite,30.0,/m/03jqhb0,/m/0bgchs4,/m/0ks8b0
9,975900,/m/03vyhn,2001-08-24,Uno,,M,,,Duane Davis,,/m/0bgchtj,/m/0bgchtm,/m/03nrwdy


In [17]:
df_character["dates_filtered"] = date(df_character["Movie release date"])
df_character["dates_filtered_DOB"] = date(df_character["Actor DOB"])
df_character

Unnamed: 0,Wikipedia Movie ID,Freebase Movie ID,Movie release date,Character Name,Actor DOB,Actor gender,Actor height,Actor ethnicity,Actor Name,Actor age at movie release,unknown 1,unknown 2,Freebase character map,dates_filtered,dates_filtered_DOB
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,2001,1958
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,2001,1974
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,2001,1969
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,2001,1967
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,2001,1977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0kr406h,/m/0b_vcv,1992,1970
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j,1992,1965
450666,28308153,/m/0cp05t9,1957,,1941-11-18,M,1.730,/m/02w7gg,David Hemmings,15.0,/m/0g8ngmc,,/m/022g44,1957,1941
450667,28308153,/m/0cp05t9,1957,,,,,,Roberta Paterson,,/m/0g8ngmj,,/m/0g8ngmm,1957,


In [58]:
temp_df = df_character['Actor ethnicity'].groupby(df_character['Actor ethnicity']).agg(['count'])
temp_df = temp_df.reset_index()


In [59]:
def get_ethnicity(freebase_id):
    # freebase -> wikidata
    query = WIKIDATA_QUERY % freebase_id
    req = requests.get("https://query.wikidata.org/sparql",
                       params={"format": "json", "query": query})
    try:
        data = req.json()
        wikidata_id = data["results"]["bindings"][0]["itemLabel"]["value"]
    except:
        return np.nan
    
    client = Client()
    entity = client.get(wikidata_id, load=True)
    
    return entity.label
    

In [60]:
temp_temp_df = temp_df.copy()
et = []
for ethnicity in temp_df['Actor ethnicity'] :
    ptint(get_ethnicity(ethnicity)
    et.append(get_ethnicity(ethnicity))

temp_temp_df['Actor ethnicity'] = et

Albanians
French Canadians
Ukrainians
Afrikaners
Asian Americans
Ashkenazi Jews
Sephardi Jews
Bosniaks
Czechs
Bulgarians
Germans
Han Chinese people
Manchu
Arab Americans
Anglo-Irish people
Chinese Indonesians
Ryukyuan people
Kikuyu
Japanese people
Copts
Swedish-speaking population of Finland
Inupiat people
Aymara
Zhuang people
Sherpa
Basque people
Indian diaspora
Tibetan people
Assyrian people
Poles
First Nations
Romanians
Latvians
Polish Americans
Danes
Chilean American
White Latin American
white people
Javanese people
Portuguese Americans
French-speaking Quebecer
Peruvians in the United Kingdom
European Americans
Kanyakubja Brahmins
Native Hawaiians
Chettiar
Greek American
British Pakistanis
Welsh Italians
Australian American
Scottish people
Finnish Americans
Yugoslavs
Spaniards
Haudenosaunee Confederacy
Afro-Cuban
Spanish American
Nair
Lebanese people in the United Kingdom
Slovene American
Chinese Filipino
Nepali Indian
Barbadian American
Cambodian Americans
Israeli Americans
Bhutia

In [69]:
for i in range (len(et)) :
    

Albanians


In [64]:
temp_temp_df['Actor ethnicity'] = et

In [75]:
et

[m'Albanians',
 m'French Canadians',
 m'Ukrainians',
 m'Afrikaners',
 m'Asian Americans',
 m'Ashkenazi Jews',
 m'Sephardi Jews',
 m'Bosniaks',
 m'Czechs',
 m'Bulgarians',
 m'Germans',
 m'Han Chinese people',
 m'Manchu',
 m'Arab Americans',
 m'Anglo-Irish people',
 m'Chinese Indonesians',
 m'Ryukyuan people',
 nan,
 nan,
 nan,
 m'Kikuyu',
 nan,
 m'Japanese people',
 nan,
 nan,
 nan,
 m'Copts',
 m'Swedish-speaking population of Finland',
 m'Inupiat people',
 nan,
 nan,
 m'Aymara',
 nan,
 m'Zhuang people',
 m'Sherpa',
 m'Basque people',
 nan,
 m'Indian diaspora',
 nan,
 nan,
 nan,
 nan,
 nan,
 m'Tibetan people',
 nan,
 nan,
 nan,
 m'Assyrian people',
 m'Poles',
 nan,
 nan,
 nan,
 nan,
 nan,
 m'First Nations',
 nan,
 m'Romanians',
 nan,
 nan,
 nan,
 m'Latvians',
 nan,
 nan,
 nan,
 nan,
 m'Polish Americans',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 m'Danes',
 nan,
 nan,
 nan,
 m'Chilean American',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 m'White Latin American',
 nan,
 nan,


In [74]:
temp_temp_df['Actor ethnicity'][0]

m'Albanians'