# Data scraping from wikidata

In [2]:
import os
while os.path.basename(os.getcwd()) != "ada-project-private":
    os.chdir('..')

import pandas as pd
import json

DATA_FOLDER = './MovieSummaries/'

# Load character.metadata.tsv
character_metadata_cols = ["Wikipedia movie ID", "Freebase movie ID",
                           "Movie release date", "Character name",
                           "Actor date of birth", "Actor gender", "Actor height (in meters)", "Actor ethnicity (Freebase ID)",
                           "Actor name", "Actor age at movie release",
                           "Freebase character/actor map ID", "Freebase character ID", "Freebase actor ID"]
character_metadata_df = pd.read_csv(DATA_FOLDER + 'character.metadata.tsv', sep='\t', header=None, names=character_metadata_cols)

# Load movie.metadata.tsv
movie_metadata_cols = ["Wikipedia movie ID", "Freebase movie ID", "Movie name", "Movie release date", "Movie box office revenue", "Movie runtime",
                       "Movie languages (Freebase ID:name tuples)", "Movie countries (Freebase ID:name tuples)",
                       "Movie genres (Freebase ID:name tuples)"]
movie_metadata_df = pd.read_csv(DATA_FOLDER + 'movie.metadata.tsv', sep='\t', header=None, names=movie_metadata_cols)

# Load name.clusters.txt
name_clusters_cols = ["Character name", "Freebase movie ID"]
name_clusters_df = pd.read_csv(DATA_FOLDER + 'name.clusters.txt', sep='\t', header=None, names=name_clusters_cols)

# Load plot_summaries.txt
plot_summaries_cols = ["Wikipedia movie ID", "Wikipedia plot"]
plot_summaries_df = pd.read_csv(DATA_FOLDER + 'plot_summaries.txt', sep='\t', header=None, names=plot_summaries_cols)

# Load tvtropes.clusters.txt
tvtropes_clusters_cols = ["Character types", "details"]
tvtropes_clusters_df = pd.read_csv(DATA_FOLDER + 'tvtropes.clusters.txt', sep='\t', header=None, names=tvtropes_clusters_cols)

tvtropes_clusters_df["details_dict"] = tvtropes_clusters_df["details"].apply(json.loads)
tvtropes_clusters_df["Character name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('char'))
tvtropes_clusters_df["Movie name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('movie'))
tvtropes_clusters_df["Freebase character/actor map ID"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('id'))
tvtropes_clusters_df["Actor name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('actor'))

tvtropes_clusters_df.drop(columns=["details", "details_dict"], inplace=True)



SAVE_PATH = "DataScraping/Wikidata/Data/"

In [3]:
import requests

def download_wikidata_sparql_csv(query, file_path):
    url = 'https://query.wikidata.org/sparql'
    
    response = requests.get(url, params={'query': query}, headers={'Accept': 'text/csv'})
    
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
        return True
    else:
        print(f"Error: Failed to retrieve data. Status code {response.status_code}")
        return False

#### Movie WikidataID - Movie FreebaseID - duration

In [8]:
generic_query = '''
SELECT ?s ?sLabel ?freebaseID ?duration
WHERE {
  VALUES ?freebaseID {
|
  }
  
  ?s wdt:P646 ?freebaseID .
  
  OPTIONAL { ?s wdt:P2047 ?duration }
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''

n = len(movie_metadata_df)
step = 100

low = 0
c = 1
while low < n:
    up = min(low + step, n)

    ids_string = "\n".join(movie_metadata_df["Freebase movie ID"][low:up].astype(str).apply(lambda x: "\"" + x + "\""))
    query = generic_query.replace("|",ids_string)
    
    success = download_sparql_csv(query, SAVE_PATH + f"tempData/duration_{c:04d}.csv")
    
    if success:
        low = up
        c += 1
        print(f"{up}/{n} done")
    else:
        print(f"{low} - {up} to redo")

durations_df = pd.concat([pd.read_csv(f"AdditionalData/duration_{i:04d}.csv") for i in range(1,c)], ignore_index=True)

In [13]:
import re

pattern = r'^Q\d+$'
def merge_labels(labels):
    n = len(labels)
    for i in range(n):
        if not re.match(pattern, labels.iloc[i]):
            return labels.iloc[i]
    return labels.iloc[n-1]

def first(l):
    return l.iloc[0]

def max_skip_na(l):
    return l.dropna().max()

result = durations_df.groupby('freebaseID').agg({
    's': first,
    'sLabel': merge_labels,
    'duration': max_skip_na
})

result = result.reset_index()

result.to_csv(SAVE_PATH + "movie_duration_from_wikidata.csv", index=False)

#### FreebaseID - Ethnicity

In [4]:
character_metadata_df["Actor ethnicity (Freebase ID)"].unique()[1:]

generic_query = '''
SELECT ?s ?sLabel ?freebaseID
WHERE {
  
  VALUES ?freebaseID {
|
  }
  
  ?s wdt:P646 ?freebaseID .
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
  
}
'''

ids_string = "\n".join(["\"" + x + "\"" for x in character_metadata_df["Actor ethnicity (Freebase ID)"].unique()[1:].astype(str)])

query = generic_query.replace('|', ids_string)

print(query)


SELECT ?s ?sLabel ?freebaseID
WHERE {
  
  VALUES ?freebaseID {
"/m/044038p"
"/m/0x67"
"/m/064b9n"
"/m/041rx"
"/m/033tf_"
"/m/04gfy7"
"/m/0222qb"
"/m/01qhm_"
"/m/0dryh9k"
"/m/048sp5"
"/m/04mvp8"
"/m/0bzkm2"
"/m/02p1pl6"
"/m/0bjbszh"
"/m/022fdt"
"/m/0cqgdq"
"/m/0ffkb4"
"/m/075dhf0"
"/m/01hwt"
"/m/0xnvg"
"/m/0dqqwy"
"/m/048z7l"
"/m/07bch9"
"/m/09v5bdn"
"/m/02w7gg"
"/m/03bkbh"
"/m/02vsw1"
"/m/09kr66"
"/m/09vc4s"
"/m/0g0x7_"
"/m/042gtr"
"/m/0cm7w1"
"/m/046cwm"
"/m/04dbw3"
"/m/02ctzb"
"/m/0g8_vp"
"/m/092h2qt"
"/m/0g6ff"
"/m/0278pqj"
"/m/0301y_"
"/m/019kn7"
"/m/0cnvdq1"
"/m/03295l"
"/m/065b6q"
"/m/03pqwy"
"/m/01xhh5"
"/m/03ts0c"
"/m/06gbnc"
"/m/07hwkr"
"/m/0bpjh3"
"/m/0fpjs3j"
"/m/04nrnz"
"/m/09k5jvk"
"/m/07mqps"
"/m/08hpk0"
"/m/03ttfc"
"/m/0d9q7j"
"/m/075_n6"
"/m/0dllcfn"
"/m/04kbvpz"
"/m/03ftx7"
"/m/0747611"
"/m/025rpb0"
"/m/06mvq"
"/m/047l_90"
"/m/029f2r"
"/m/01rv7x"
"/m/05sf2x"
"/m/01336l"
"/m/0bh91q8"
"/m/01g7zj"
"/m/0cn68"
"/m/02sch9"
"/m/0fqp6zk"
"/m/02y_9mh"
"/m/0d7wh"
"/m/0g96wd"
"

#### Movie - cast member - role

In [10]:
generic_query = '''
SELECT ?s ?sLabel ?freebaseID ?castMember ?castMemberLabel ?character ?characterLabel ?roleLabel
WHERE {
  
  VALUES ?freebaseID { 
|
  }
  
  ?s wdt:P646 ?freebaseID .

  OPTIONAL { 
    ?s p:P161 ?castStatement .
    ?castStatement ps:P161 ?castMember .
    OPTIONAL { ?castStatement pq:P453 ?character }
    OPTIONAL { ?castStatement pq:P4633 ?role }
  }
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
  
}
'''

n = len(movie_metadata_df)
step = 100

low = 0
c = 1
while low < n:
    up = min(low + step, n)

    ids_string = "\n".join(movie_metadata_df["Freebase movie ID"][low:up].astype(str).apply(lambda x: "\"" + x + "\""))
    query = generic_query.replace("|", ids_string)
    
    success = download_sparql_csv(query, SAVE_PATH + f"tempData/actors_roles_{c:04d}.csv")
    
    if success:
        low = up
        c += 1
        print(f"{up}/{n} done")
    else:
        print(f"{low} - {up} to redo")

pd.concat([pd.read_csv(f"tempData/actors_roles_{i:04d}.csv") for i in range(1,c)], ignore_index=True).to_csv(SAVE_PATH + "actors_roles_from_wikidata.csv", index=False)

100/81741 done
200/81741 done
300/81741 done
400/81741 done
500/81741 done
600/81741 done
700/81741 done
800/81741 done
900/81741 done
1000/81741 done
1100/81741 done
1200/81741 done
1300/81741 done
1400/81741 done
1500/81741 done
1600/81741 done
1700/81741 done
1800/81741 done
1900/81741 done
2000/81741 done
2100/81741 done
2200/81741 done
2300/81741 done
2400/81741 done
2500/81741 done
2600/81741 done
2700/81741 done
2800/81741 done
2900/81741 done
3000/81741 done
3100/81741 done
3200/81741 done
3300/81741 done
3400/81741 done
Error: Failed to retrieve data. Status code 429
3400 - 3500 to redo
3500/81741 done
3600/81741 done
3700/81741 done
3800/81741 done
3900/81741 done
4000/81741 done
4100/81741 done
4200/81741 done
4300/81741 done
4400/81741 done
4500/81741 done
4600/81741 done
4700/81741 done
4800/81741 done
4900/81741 done
5000/81741 done
5100/81741 done
5200/81741 done
5300/81741 done
5400/81741 done
5500/81741 done
5600/81741 done
5700/81741 done
5800/81741 done
5900/81741 do

FileNotFoundError: [Errno 2] No such file or directory: 'AdditionalData/actors_roles_0001.csv'