In [75]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import random

from src.data.transform_data import* 
from src.models.movie_success_model import* 
from src.models.actor_success_model import*
from src.utils.plot_graphs import*

df = pd.read_csv('data/movie_stats.csv')

In [76]:
raw_df = raw_data()
raw_df.to_csv('data/final_merged_data.csv', index=False)
raw_df.head()

Unnamed: 0,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Review score,Movie votes,Movie director,Movie star,Movie budget,Movie gross,Movie company,Number of nomination,Nomination winner
0,/m/03vyhn,ghosts_of_mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}",United States of America,"Thriller, Science Fiction, Horror, Adventure, ...",4.9,52000.0,John Carpenter,Natasha Henstridge,28000000.0,14010832.0,Screen Gems,,
1,/m/08yl5d,getting_away_with_murder:_the_jonbenét_ramsey_...,2000,,95.0,"{""/m/02h40lc"": ""English Language""}",United States of America,"Mystery, Biographical film, Drama, Crime Drama",,,,,,,,,
2,/m/0crgdbh,brun_bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}",Norway,"Crime Fiction, Drama",,,,,,,,,
3,/m/0285_cd,white_of_the_eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}",United Kingdom,"Thriller, Erotic thriller, Psychological thriller",6.2,2200.0,Donald Cammell,David Keith,,,Mrs. White's Productions,,
4,/m/01mrr1,a_woman_in_flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}",Germany,Drama,,,,,,,,,


In [77]:
df = clean_data(raw_df)

In [78]:
movie_success_index(df)

In [79]:
actor_df = actor_data(df)
actor_df = actor_success_index(actor_df)

In [80]:
actor_df.rename_axis('actor_name')

Unnamed: 0_level_0,Cumulative Score,Actor Score Index
actor_name,Unnamed: 1_level_1,Unnamed: 2_level_1
denzel_washington,3.327946,10.000000
matt_damon,3.302120,9.912690
tom_hanks,3.251497,9.741555
eddie_murphy,3.154919,9.415060
tom_cruise,3.139299,9.362255
...,...,...
j._kenneth_campbell,0.581335,0.714754
evelyn_keyes,0.577251,0.700949
james_dixon,0.577251,0.700949
reilly_murphy,0.444397,0.251820


In [81]:
def cap_surnames(name):
    name = name.replace('_', ' ')
    parts = name.split() # Split name and surname into parts
    
     # Process each part of the name
    for i in range(len(parts)):
        part = parts[i].lower()

        # Remove nicknames in single quotes
        if part.startswith("'") and part.endswith("'"):
            parts[i] = ''  # Mark the nickname for removal
            continue

        # Handle "McSomething"
        if part.startswith('mc'):
            parts[i] = part[:2].capitalize() + part[2:].capitalize()

        # Handle "MacSomething"
        elif part.startswith('mac'):
            parts[i] = part[:3].capitalize() + part[3:].capitalize()

        # Handle names with apostrophes, e.g., O'Something
        elif "'" in part:
            subparts = part.split("'")
            parts[i] = "%27".join([sub.capitalize() for sub in subparts])

        # Handle names with hyphens, e.g., Jean-Claude
        elif '-' in part:
            subparts = part.split('-')
            parts[i] = '-'.join([sub.capitalize() for sub in subparts])

        # Handle "von" (do not capitalize "von")
        elif part == 'von':
            parts[i] = 'von'
        
        # Handle Leo
        elif part == 'dicaprio':
            parts[i] = 'DiCaprio'

        # Default capitalization for other parts
        else:
            parts[i] = part.capitalize()

    # Remove empty parts (e.g., nicknames marked as '')
    parts = [part for part in parts if part]

    return ' '.join(parts)
        

In [82]:
actor_df.index = actor_df.index.map(cap_surnames)
character_data = actor_df[:50]

In [94]:
# Define the scraping function
def fetch_wikipedia_data(actor_name):
    """
    Fetch actor data from Wikipedia.
    """
    data = {'University': None, 'Theater': None, 'Sports': None, 'Birth City': None}
    try:
        # Construct the Wikipedia URL
        url = f"https://en.wikipedia.org/wiki/{actor_name.replace(' ', '_')}"
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        
        # Check for a successful request
        if response.status_code != 200:
            print(f"Failed to fetch data for {actor_name}: HTTP {response.status_code}")
            return data
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Locate the infobox
        infobox = soup.find('table', class_='infobox')
        if infobox:
            rows = infobox.find_all('tr')
            for row in rows:
                header = row.find('th')
                cell = row.find('td')
                if not header or not cell:
                    continue
                
                header_text = header.text.strip()
                cell_text = cell.text.strip()
                
                # University
                if 'Alma mater' in header_text or 'Alma\xa0mater' in header_text or 'Education' in header_text:
                    
                    data['University'] = cell_text

                # Birth City
                if 'Born' in header_text:
                    birthplace = cell.find('div', class_='birthplace')
                    if birthplace:
                        data['Birth City'] = birthplace.text.strip()
                    else:
                        # Check for text after <br>
                        br_tag = cell.find('br')
                        if br_tag:
                            # If the next sibling is an HTML tag, extract its text
                            if br_tag.next_sibling and br_tag.next_sibling.name == 'a':
                                data['Birth City'] = br_tag.next_sibling.text.strip()
                            # If it's just a string, extract it directly
                            elif br_tag.next_sibling and isinstance(br_tag.next_sibling, str):
                                data['Birth City'] = br_tag.next_sibling.strip()
                    
        # Sport & Theater
        early_life = soup.find('h2', {'id': lambda x: x and 'early_life' in x.lower()})  # Look for an "Early life" section
        if early_life:
            section = early_life.find_parent()
            sports_keywords = ['soccer', 'football', 'basketball', 'baseball', 'tennis', 'track', 'swimming', 'martial arts', 'ballet', 'dance']
            theater_keywords = ['theater', 'theatre']
                    # Iterate through all siblings in the section until we encounter the next section header
            for sibling in section.find_next_siblings():
                if sibling.name in ['div', 'h2', 'h3']:  # Reached the next section
                    break
                if sibling.name == 'p':  # Check paragraphs within the section
                    paragraph_text = sibling.text.lower()
                    for keyword in sports_keywords:
                        if keyword in paragraph_text:
                            data['Sports'] = keyword.capitalize()
                            break  # Stop searching once we find a match
                    for keyword in theater_keywords:
                        if keyword in paragraph_text:
                            data['Theater'] = 'Yes'
                            break

        return data
    except Exception as e:
        print(f"Error fetching data for {actor_name}: {e}")
        return {
            'Height': 'Not available',
            'University': 'Not available',
            'Theater': 'Not available',
            'Sports': 'Not available',
            'Birth City': 'Not available'}

In [95]:
chris = fetch_wikipedia_data('Chris Rock')
print(chris)

{'University': None, 'Theater': None, 'Sports': None, 'Birth City': 'Andrews, South Carolina'}


In [93]:
# Add new columns to the dataframe
character_data['University'] = None
character_data['Theater'] = None
character_data['Sports'] = None
character_data['Birth City'] = None

# Main loop for scraping
for idx, row in tqdm(character_data.iterrows(), total=len(character_data)):
    actor_name = row.name  # Adjust column name as per your dataset
    actor_data = fetch_wikipedia_data(actor_name)
    # Update the dataframe with the fetched data
    character_data.at[idx, 'University'] = actor_data['University']
    character_data.at[idx, 'Theater'] = actor_data['Theater']
    character_data.at[idx, 'Sports'] = actor_data['Sports']
    character_data.at[idx, 'Birth City'] = actor_data['Birth City']

    # Respectful scraping: Introduce delay
    time.sleep(random.uniform(1, 3))

# Save the updated dataframe
character_data.to_csv('actors_updated.csv', index=False)
print("Scraping completed. Updated data saved to actors_updated.csv.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  character_data['University'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  character_data['Theater'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  character_data['Sports'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

Scraping completed. Updated data saved to actors_updated.csv.



