# Gathering Movie Data

### Method: Wikipedia API
It appears that Wikipedia would be a good source to scrape information from. 
Wikipedia permits scraping for non-commerical "Creative Commons Share-Alike" ventures: see https://en.wikipedia.org/wiki/Wikipedia:Reusing_Wikipedia_content and their public API: https://pypi.org/project/wikipedia/

In [21]:
# you must install the following: 
# pip install wikipedia
# pip install wikipedia_sections
import wikipedia

### api exploration 

In [26]:
films_1940 = wikipedia.page("1940 in film")
films_1940.content[:100]

'The year 1940 in film involved some significant events, including the premieres of the Walt Disney f'

In [None]:
films_1940.sections # warning: this will be blank if you did not install wikipedia-sections

In [None]:
films_1940.section('A')

In [None]:
films_1940.section('Z')

### Movie Title Dataset Creation

In [73]:
# this method of scraping only works until 1972
years = range(1940, 1973, 1) 
title = lambda y : f"{y} in film"
Alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

In [74]:
title_dict = {}

for year in years: 
    page = wikipedia.page(title(year))    
    list_movies = []
    for C in Alphabet: 
        text = page.section(C)
        
        if text is not None: 
            section_movies = text.split('\n')
            section_movies = [line.split(',')[0] for line in section_movies]
            list_movies += section_movies
            
    title_dict[year] = list_movies

In [75]:
years = range(1974, 2024, 1) 

In [76]:
import IPython.display as display

current = 0
total = len(years)
for year in years: 
    print(year)
    title_dict[year] = []
    
    # display progress
    current += 1
    display.clear_output(wait=True)
    progress = '{0:.2f}'.format(current*100/total)
    display.display(f"Progress: {progress}%")

    dfs = []
    try:
        html = requests.get(f"https://en.wikipedia.org/wiki/List_of_American_films_of_{year}").text
        dfs = pd.read_html(html)
    except Exception as e: 
        print(e)
        continue 
    
    for df in dfs:
        if('Title' in df.columns):
            title_dict[year] += list(df['Title'])

'Progress: 100.00%'

  dfs = pd.read_html(html)


In [77]:
title_dict

{1940: ['Abe Lincoln in Illinois',
  'All This and Heaven Too',
  'Andy Hardy Meets Debutante',
  'Angels Over Broadway',
  'Anne of Windy Poplars',
  'The Ape',
  'Arise',
  'Arizona',
  'Band Waggon',
  'The Bank Dick',
  'Beating Heart',
  'Before I Hang',
  'The Biscuit Eater',
  'Bismarck',
  'Black Friday',
  'The Blue Bird',
  'Boom Town',
  'The Boys from Syracuse',
  'Boys of the City',
  'Brigham Young',
  'British Intelligence',
  'Broadway Melody of 1940',
  'Brother Orchid',
  'Calling Philo Vance',
  'Castle on the Hudson',
  'The Catacombs (Katakomby)',
  'Charlie Chan at the Wax Museum',
  'Charlie Chan in Panama',
  'Christmas in July',
  'A Chump at Oxford',
  'City for Conquest',
  'Confucius – (China)',
  'Contraband',
  'Convicted Woman',
  'Convoy – directed by Pen Tennyson',
  'Crimes at the Dark House',
  'Dance',
  'Dark Command',
  "Dead Man's Shoes",
  'The Devil Bat',
  "A Dispatch from Reuter's",
  'Dívka v modrém – (Czechoslovakia)',
  'Doomed to Die',
  '

In [89]:
df_dict = {}
df_dict["Title"] = []
df_dict["Year"] = []
df_dict["Content"] = []

for year, titles in title_dict.items():
    
    display.clear_output(wait=True)
    display.display(f"Year: {year}")
    
    for title in titles: 
        
        # remove film that has no wiki page
        film = None
        try:
            film = wikipedia.page(title)
        except: 
            continue; 
            
        df_dict["Title"].append(title)
        df_dict["Year"].append(year)
        df_dict["Content"].append(film.summary)

'Year: 2023'

In [90]:
pd.DataFrame.from_dict(df_dict)

Unnamed: 0,Title,Year,Content
0,Abe Lincoln in Illinois,1940,Abe Lincoln in Illinois is a 1940 biographical...
1,All This and Heaven Too,1940,"All This, and Heaven Too is a 1940 American dr..."
2,Andy Hardy Meets Debutante,1940,Andy Hardy Meets Debutante is a 1940 American ...
3,Angels Over Broadway,1940,Angels Over Broadway (also called Before I Die...
4,Anne of Windy Poplars,1940,Anne of Windy Poplars—published as Anne of Win...
...,...,...,...
12263,The Iron Claw,2023,The Iron Claw is a 2023 biographical sports dr...
12264,Memory,2023,Memory is the faculty of the mind by which dat...
12265,The Boys in the Boat,2023,The Boys in the Boat: Nine Americans and Their...
12266,Ferrari,2023,Ferrari S.p.A. (; Italian: [ferˈraːri]) is an ...


In [91]:
df = pd.DataFrame.from_dict(df_dict)

In [92]:
df.to_csv('movies.csv', index=False) 