In [None]:
def get_movies_since(year):
    """Build the movie table for titles whose start year is ``year`` or later.

    Reads ``year.csv`` and ``ratings.csv`` (both tab-separated) from the
    working directory, keeps rows whose ``titleType`` is ``movie`` or
    ``tvMovie`` and whose ``startYear`` is a number >= ``year``, attaches the
    rating columns, and writes the result to ``movies_since_<year>.csv``.

    Args:
        year: Earliest start year to keep; an int or a numeric string such as
            ``"1990"``.

    Returns:
        DataFrame with the columns ``title id``, ``Year``, ``Rating``,
        ``Votes``, ``Title`` and ``rating link``, sorted by ``title id``.

    Raises:
        ValueError: If ``year`` cannot be converted to an integer.
    """
    import pandas as pd

    min_year = int(year)

    year_data = pd.read_csv('year.csv', on_bad_lines='skip', sep="\t")
    movie_data = year_data[year_data["titleType"].isin(["movie", "tvMovie"])]

    # Compare years numerically: placeholders such as "\N" (and any other
    # non-numeric junk) become NaN, and NaN >= min_year is False, so those
    # rows are dropped without a separate filtering step.
    numeric_start_year = pd.to_numeric(movie_data["startYear"], errors="coerce")
    movie_data_with_valid_year = movie_data.loc[
        numeric_start_year >= min_year, ["tconst", "startYear", "originalTitle"]
    ]

    imdb_data = pd.read_csv("ratings.csv", sep="\t")

    # Right join: keep every selected movie even when it has no ratings row.
    year_vote_data = pd.merge(imdb_data, movie_data_with_valid_year, on="tconst", how="right")
    year_vote_data = year_vote_data[["tconst", "startYear", "averageRating", "numVotes", "originalTitle"]]
    year_vote_data.columns = ["title id", "Year", "Rating", "Votes", "Title"]
    year_vote_data["rating link"] = "https://www.imdb.com/title/" + year_vote_data["title id"] + "/ratings/?ref_=tt_ov_rt"
    year_vote_data = year_vote_data.sort_values(by="title id").reset_index(drop=True)
    year_vote_data.to_csv("movies_since_" + str(min_year) + ".csv", index=False)

    return year_vote_data
# Build the base movie table; the call also writes it to a CSV in the working directory.
dataset = get_movies_since("1990")

## Extracting vote data

In [26]:
def get_votes(link, vote_threshold=10000, timeout=30):
    """Download a ratings page and extract its star histogram and rating.

    Args:
        link: URL of the ratings page to download.
        vote_threshold: Pages whose total vote count is below this value are
            reported as "no data" (ten zeros and a rating of 0.0).
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot hang a long scraping run.

    Returns:
        ``(stars, rating)`` where ``stars`` is a list of ten ints (the 2nd to
        11th ``leftAligned`` cells, in page order) and ``rating`` is a float.
        Returns ``([0] * 10, 0.0)`` when the page reports no ratings, is the
        "no webpage" error page, or is below ``vote_threshold``.

    Raises:
        ValueError: If the total-vote marker is missing, the count in front of
            it is not a number, fewer than ten histogram cells are found, or
            the rating element is missing.
    """
    import re
    import requests

    request_text = requests.get(link, timeout=timeout).text

    if "No Ratings Available" in request_text or "Well, what if there is no webpage?" in request_text:
        return [0] * 10, 0.0

    # The total vote count sits in the 14 characters that end one character
    # before the marker text (the character right before it is skipped).
    marker_pos = request_text.index("IMDb users have given a ")
    count_text = request_text[max(marker_pos - 15, 0):max(marker_pos - 1, 0)]
    total_votes = int(count_text.strip().replace(",", ""))
    if total_votes < vote_threshold:
        return [0] * 10, 0.0

    star_votes = re.findall(r'<div class="leftAligned">(.*?)</div>', request_text)
    rating = re.findall(r'span class="ipl-rating-star__rating">(.*?)</span>', request_text)

    # The first "leftAligned" cell is skipped; the next ten hold the counts.
    stars = [int(vote.replace(",", "")) for vote in star_votes[1:11]]
    if len(stars) != 10:
        raise ValueError("expected 10 star-vote cells, found " + str(len(stars)))
    if not rating:
        raise ValueError("rating element not found on page")

    return stars, float(rating[0])

In [4]:
# Smoke test on a single title; the original note records roughly 1 second per link.
stars, rating = get_votes("https://www.imdb.com/title/tt0068646/ratings/?ref_=tt_ov_rt", vote_threshold=10000)

In [38]:
def fill_dataset(dataset, start=0, end=0, zip=1):
    """Scrape vote data for rows ``start`` .. ``end - 1`` and store it in place.

    For each row position the ``rating link`` is passed to ``get_votes``; the
    ten star counts are written to the columns ``"1"`` .. ``"10"`` and
    ``Rating`` / ``Votes`` are overwritten with the scraped rating and the sum
    of the star counts.

    Args:
        dataset: DataFrame with a ``rating link`` column; modified in place.
        start: First row position to process (inclusive).
        end: Row position to stop at (exclusive).
        zip: When truthy, write ``dataset[:end]`` to
            ``movie_data_at_<start>_<end>.csv`` after the loop.

    Returns:
        The same ``dataset`` object.
    """
    star_columns = [str(star) for star in range(1, 11)]
    failures = 0

    for i in range(start, end):
        try:
            # Read and write the same row: translate the position into its
            # index label so the write cannot land on a different row (or
            # append a new one) when the index is not 0..n-1.
            row_label = dataset.index[i]
            stars, rating = get_votes(dataset["rating link"].iloc[i])
            # NOTE(review): assumes get_votes returns the counts in the order
            # of the columns "1".."10" - confirm against the page layout.
            dataset.loc[row_label, star_columns] = stars
            dataset.loc[row_label, ["Rating", "Votes"]] = rating, sum(stars)
        except Exception:
            # Best effort: a failing row is skipped, but Ctrl-C
            # (KeyboardInterrupt) is no longer swallowed.
            failures += 1
        if i % 100 == 0:
            print(i,". movie done between ",start,"-",end, sep="")

    if failures:
        print(failures, " rows could not be filled between ", start, "-", end, sep="")

    if zip:
        print(end,". movie done between ",start,"-",end, sep="")
        dataset[:end].to_csv("movie_data_at_"+str(start)+"_"+str(end)+".csv", index=False)
    return dataset

In [None]:
import pandas as pd
# Resume scraping from a saved checkpoint and process row positions 6000-6999
# (range(start, end) excludes end).
# NOTE(review): fill_dataset saves dataset[:end], so a checkpoint written with
# end=6000 would presumably hold only the first 6000 rows, leaving no rows
# 6000+ to fill here - confirm how this file was produced.
dataset = pd.read_csv("movie_data_at_5000_6000.csv")
dataset = fill_dataset(dataset, start=6000, end=7000, zip=1)

## Merging movie and box office datasets

In [1]:
def merge_movies_with_boxoffices(movie_csv_path, how_to_merge):
    """Join the scraped movie table with the worldwide box-office table.

    Movies are read from ``movie_csv_path`` and kept only when column ``"1"``
    is not null and ``Votes`` is positive. Box-office rows come from
    ``year_movie_worldwide_box.csv`` (its three columns are renamed to
    ``Year``, ``Title`` and ``WorldwideBox Office``). Both tables are keyed on
    a ``"Title-Year"`` string of the form ``"<year> <title>"``.

    Args:
        movie_csv_path: Path of the movie CSV to load.
        how_to_merge: Join type handed to ``pd.merge`` as ``how``.

    Returns:
        The merged DataFrame: the kept movie columns plus ``Title-Year`` and
        ``WorldwideBox Office``.
    """
    import pandas as pd

    key = "Title-Year"
    gross = "WorldwideBox Office"

    movies = pd.read_csv(movie_csv_path)
    keep = movies["1"].notnull() & (movies["Votes"] > 0)
    movies = movies[keep]
    movies = movies.assign(**{key: movies["Year"].astype(str) + " " + movies["Title"]})

    box_office = pd.read_csv("year_movie_worldwide_box.csv")
    box_office.columns = ["Year", "Title", gross]
    box_office[key] = box_office["Year"].astype(int).astype(str) + " " + box_office["Title"]

    # Only box-office rows whose key also occurs among the kept movies take
    # part in the join.
    known_keys = movies[key].tolist()
    matching_box_office = box_office.loc[box_office[key].isin(known_keys), [key, gross]]

    return pd.merge(movies, matching_box_office, on=key, how=how_to_merge)

In [None]:
# Merge the scraped movie table with the box-office table ("outer" is passed to
# pd.merge as the join type) and preview the first rows.
merge = merge_movies_with_boxoffices("informative_movie_data.csv", "outer")
merge.head(3)

# Getting revenues

In [None]:
def get_revenue(link, timeout=30):
    """Download a page and return its "Gross worldwide" amount as an int.

    The amount is the text between the first ``$`` after the
    "Gross worldwide" label and the next ``</span>``, with thousands
    separators removed.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot hang a long scraping run.

    Returns:
        The worldwide gross as an int.

    Raises:
        ValueError: If the "Gross worldwide" label is not on the page
            (message ``"cant find worldwide gross"``), if no ``$`` follows
            it, or if the extracted text is not an integer.
    """
    import requests

    label = "<span class=\"ipc-metadata-list-item__label\">Gross worldwide</span>"
    text = requests.get(link, timeout=timeout).text

    label_pos = text.find(label)
    if label_pos == -1:
        # ValueError is still an Exception, so existing callers keep working.
        raise ValueError("cant find worldwide gross")

    after_label = text[label_pos:]
    amount_text = after_label[after_label.index("$") + 1:].split("</span>")[0]
    return int(amount_text.replace(",", ""))

In [None]:
def fill_revenues(dataset, start=0, end=0, zip=1):
    """Scrape the worldwide gross for rows ``start`` .. ``end - 1`` in place.

    Each row's ``imdb_url`` is passed to ``get_revenue`` and the result is
    stored in ``WorldwideBox Office``. Progress is printed every 100 rows.
    When ``zip`` is truthy, a checkpoint of ``dataset[:end]`` is written on
    every 1000th row. The full table is always written to
    ``movie_revenue_data_all.csv`` at the end.

    Args:
        dataset: DataFrame with an ``imdb_url`` column; modified in place.
        start: First row position to process (inclusive).
        end: Row position to stop at (exclusive).
        zip: When truthy, write the periodic checkpoint files.

    Returns:
        The same ``dataset`` object.
    """
    from datetime import datetime

    failures = 0
    for i in range(start, end):
        try:
            # Write to the label of the row that was read, so a non-default
            # index cannot redirect the value to another (or a new) row.
            row_label = dataset.index[i]
            revenue = get_revenue(dataset["imdb_url"].iloc[i])
            dataset.loc[row_label, "WorldwideBox Office"] = revenue
        except Exception:
            # Best effort: a failing row is skipped, but Ctrl-C
            # (KeyboardInterrupt) is no longer swallowed.
            failures += 1
        if i % 100 == 0:
            print(datetime.now().strftime("%H:%M:%S"), " ..... ", i,". movie done between ",start,"-",end, sep="")
            if i%1000 == 0 and zip:
                dataset[:end].to_csv("movie_revenue_data_at_"+str(start)+"_"+str(end)+".csv", index=False)

    if failures:
        print(failures, " rows could not be filled between ", start, "-", end, sep="")

    dataset.to_csv("movie_revenue_data_all.csv", index=False)
    return dataset

In [None]:
def _extract_wikipedia_box_office(html):
    """Return the text of the first no-wrap span after the "Box office" header cell."""
    nowrap_marker = "style=\"white-space: nowrap\">"
    infobox_tail = html[html.index("Box office</th>"):]
    value_start = infobox_tail.index(nowrap_marker) + len(nowrap_marker)
    # Look for the closing tag only AFTER the value starts; an earlier
    # "</span>" in the same cell would otherwise produce an empty result.
    value_end = infobox_tail.index("</span>", value_start)
    return infobox_tail[value_start:value_end]


def get_revenue_from_wikipedia(title_year, timeout=30):
    """Look up a film's box-office text on English Wikipedia.

    Tries three page names in order - ``<Title>``, ``<Title>_(film)`` and
    ``<Title>_(<year>_film)`` - and returns the value from the first page
    that yields one.

    Args:
        title_year: String of the form ``"<year> <title>"``; it is split at
            the first space.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The raw box-office text found on the page, as a string.

    Raises:
        ValueError: If ``title_year`` contains no space, or if none of the
            candidate pages yields a value (message
            ``"cant find worldwide gross"``).
    """
    import requests
    from urllib.parse import quote

    year, separator, title = title_year.partition(" ")
    if not separator:
        raise ValueError("title_year must look like '<year> <title>'")

    # Percent-encode the title so characters such as "?" or "#" stay part of
    # the page name instead of starting a query string or fragment.
    base_url = "https://en.wikipedia.org/wiki/" + quote(title.replace(" ", "_"))
    candidate_urls = (
        base_url,
        base_url + "_(film)",
        base_url + "_(" + year + "_film)",
    )

    for url in candidate_urls:
        try:
            page = requests.get(url, timeout=timeout).text
            return _extract_wikipedia_box_office(page)
        except Exception:
            # This candidate failed (network error or no infobox value);
            # fall through to the next page name.
            continue

    # ValueError is still an Exception, so existing callers keep working.
    raise ValueError("cant find worldwide gross")

In [None]:
def fill_revenues_from_wikipedia(dataset, indices, start=0, end=0, zip=1):
    """Fill ``WorldwideBox Office`` for selected rows from Wikipedia, in place.

    Each row's ``Title-Year`` is passed to ``get_revenue_from_wikipedia`` and
    the result is stored in ``WorldwideBox Office``. Progress is printed when
    the row position is a multiple of 100. When ``zip`` is truthy, a
    checkpoint of ``dataset[:end]`` is written when the position is a
    multiple of 1000. The full table is always written to
    ``movie_revenue_data_all.csv`` at the end.

    Args:
        dataset: DataFrame with a ``Title-Year`` column; modified in place.
        indices: Iterable of row positions to process.
        start: Used only in the progress message and checkpoint file name.
        end: Used in the progress message, the checkpoint file name and the
            ``dataset[:end]`` checkpoint slice.
        zip: When truthy, write the periodic checkpoint files.

    Returns:
        The same ``dataset`` object.
    """
    from datetime import datetime

    failures = 0
    for i in indices:
        try:
            # Write to the label of the row that was read, so a non-default
            # index cannot redirect the value to another (or a new) row.
            row_label = dataset.index[i]
            revenue = get_revenue_from_wikipedia(dataset["Title-Year"].iloc[i])
            dataset.loc[row_label, "WorldwideBox Office"] = revenue
        except Exception:
            # Best effort: a failing row is skipped, but Ctrl-C
            # (KeyboardInterrupt) is no longer swallowed.
            failures += 1
        if i % 100 == 0:
            print(datetime.now().strftime("%H:%M:%S"), " ..... ", i,". movie done between ",start,"-",end, sep="")
            if i%1000 == 0 and zip:
                dataset[:end].to_csv("movie_revenue_data_at_"+str(start)+"_"+str(end)+".csv", index=False)

    if failures:
        print(failures, " rows could not be filled from Wikipedia", sep="")

    dataset.to_csv("movie_revenue_data_all.csv", index=False)
    return dataset

In [49]:
import pandas as pd
import numpy as np
# Load the combined revenue table (read_csv already returns a DataFrame) and
# make the box-office column numeric: cast to text, drop thousands separators,
# then coerce anything that still is not a number to NaN.
data = pd.read_csv("movie_revenue_data_all.csv")
data['WorldwideBox Office'] = pd.to_numeric(
    data['WorldwideBox Office'].astype('str').str.replace(',', ''),
    errors='coerce',
)

In [50]:
# Inspect the box-office column after the numeric conversion.
data['WorldwideBox Office']

0       76019048.0
1       71609321.0
2       21413502.0
3       33461269.0
4        7331647.0
           ...    
7620     4961424.0
7621     8399765.0
7622     1351662.0
7623      522938.0
7624     4588389.0
Name: WorldwideBox Office, Length: 7625, dtype: float64