## Scraping

The code used for scraping is given below. In this example we scrape entries of `df_meta` directly, but in our actual code we only scraped the movies we knew we would keep, that is, those from the dataframe `df_clean`.

In [5]:
import requests
from bs4 import BeautifulSoup
import pywikibot
from pywikibot import *

def _last_english_label(claims, property_id):
    """Return the English label of the last claim target for *property_id*.

    Args:
        claims: The claim dictionary of a Wikidata item, keyed by property ID.
        property_id: Property to read, e.g. ``"P57"``.

    Returns:
        The ``labels['en']`` value of the last claim's target, or ``None`` if
        the property is absent, has no usable target, has no English label,
        or the label cannot be retrieved.
    """
    try:
        target = claims[property_id][-1].getTarget()
        return target.labels['en']
    except Exception:
        # Deliberately best-effort: any lookup failure yields a missing value.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long scraping run can still be interrupted.
        return None


def scraping(wiki_ID):
    """Look up the director and color of a movie on Wikidata.

    The identifier is first resolved to a Wikidata entity through the public
    SPARQL endpoint (matching on property P646), then the entity's claims are
    read with pywikibot.

    Args:
        wiki_ID: Identifier matched against P646, e.g. ``"/m/0abc"``.
            NOTE(review): callers in this file pass the ``freeID`` column, so
            this is presumably a Freebase ID despite the parameter name.

    Returns:
        A three-element list ``[wiki_ID, director, color]``. ``director``
        (P57) and ``color`` (P462) are English labels, or ``None`` when the
        value is unavailable. If no entity can be resolved, the list is
        ``[wiki_ID, None, None]``.

    Raises:
        Exceptions raised by ``requests.get`` or by loading the item with
        pywikibot are not caught here and propagate to the caller.
    """
    # Only "/" is escaped in the ID; the rest of the query is pre-encoded.
    wiki_ID_encoded = wiki_ID.replace("/", "%2F")

    # Pre-encoded SPARQL query: SELECT ?s ... WHERE { ?s wdt:P646 "<id>" ... }
    query_prefix = (
        "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=PREFIX%20wd%3A%20%3Chttp%3A%2F%2"
        "Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREF"
        "IX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0A%0ASELECT%20%20%3Fs%20%3FsLabel%20%3Fp%20%20%3Fo%2"
        "0%3FoLabel%20WHERE%20%7B%0A%20%3Fs%20wdt%3AP646%20%22"
    )
    query_suffix = (
        "%22%20%0A%0A%20%20%20SERVICE%20wikibase%3Alabel%20%7B%0A%20%20%20%20bd%3AserviceParam%20wikibase%3Alangua"
        "ge%20%22en%22%20.%0A%20%20%20%7D%0A%20%7D"
    )

    response = requests.get(query_prefix + wiki_ID_encoded + query_suffix)

    try:
        # Parse the JSON body once and take the first binding's entity URI.
        entity_uri = response.json()['results']['bindings'][0]['s']['value']
    except (ValueError, KeyError, IndexError, TypeError):
        # Non-JSON body, unexpected structure, or no matching entity.
        print("Scraping failed")
        return [wiki_ID, None, None]

    # The entity ID is the last path segment of the entity URI.
    entity = entity_uri.split('/')[-1]
    print(entity)

    site = pywikibot.Site("wikidata", "wikidata")
    repo = site.data_repository()
    item = pywikibot.ItemPage(repo, entity)
    claims = item.get()["claims"]

    row = [
        wiki_ID,
        _last_english_label(claims, "P57"),   # P57 for a movie director
        _last_english_label(claims, "P462"),  # P462 for the color
    ]

    print('scraped sucessfuly')
    return row

In [6]:
import pandas as pd
import numpy as np

# Load the tab-separated movie metadata. `header=None` makes pandas treat the
# first line as data, so the column names are supplied explicitly.
df_meta = pd.read_csv(
    "data/MovieSummaries/movie.metadata.tsv",
    sep='\t',
    header=None,
    names=[
        "wikiID",
        "freeID",
        "name",
        "release_date",
        "revenue",
        "runtime",
        "languages",
        "countries",
        "genres",
    ],
)

In [None]:
# Demo on the first 10 rows only; every call to `scraping` hits the network.
sub_df = df_meta[0:10]

# One scraped row per movie. The list holds no placeholder entry, because an
# empty inner list would turn into an all-empty first row once the rows are
# loaded into a DataFrame.
list_scrap = [scraping(free_id) for free_id in sub_df['freeID']]

In [None]:
# Assemble the scraped rows into a table and persist it as a UTF-8 CSV file.
df_scrapped = pd.DataFrame(
    list_scrap,
    columns=['freeID', 'director', 'color'],
)
df_scrapped.to_csv(
    './scrapped/scrapped.csv',
    encoding='utf-8',
)