In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [13]:
# URL of the Wikipedia page
url_0_C = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(0%E2%80%939,_A%E2%80%93C)"
url_D_J = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(D%E2%80%93J)"
url_K_R = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(K%E2%80%93R)"
url_S_Z = "https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(S%E2%80%93Z)"

urls = [url_0_C, url_D_J, url_K_R, url_S_Z]


In [14]:
def scrap_book_to_movie(url): 
    response = requests.get(url)
    result = []
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find('div', {'class': 'mw-parser-output'})
        tables = content.find_all('table', {'class': 'wikitable'})
        for table in tables:
            rows = table.find_all('tr')
            
            for row in rows:
                cells = row.find_all('td')

                # splits into book and movie
                cell_tab = [cell.get_text(strip=True) for cell in cells]
                result.append(cell_tab)    

    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

    result = pd.DataFrame(result)
    result.columns = ['fiction_work', 'film_adaptations']
    return result
    

In [15]:
def extract_years(text):
    years = re.findall(r'\((?:[^)]*?)(\d{4}(?:[–-](?:\d{4}|present))?)(?:[^)]*?)\)', text)
    return years[0] if years else None

In [16]:
def extract_features(df):
    df['title_book'] = df['fiction_work'].str.split('(').str[0]
    df_split_comma = df['fiction_work'].str.split(',')
    df['author_book'] = df_split_comma.apply(lambda x: x[-1] if len(x) > 1 else None)
    df['no_author_test'] = df_split_comma.apply(lambda x: False if len(x) > 1 else True)
    df['year_book'] = df['fiction_work'].apply(extract_years)

    df['title_film'] = df['film_adaptations'].str.split('(').str[0]
    df['year_film'] = df['film_adaptations'].apply(extract_years)

    df = df.drop(['fiction_work', 'film_adaptations'], axis = 1)

    return df

In [17]:
def clean_same_as_above_below(df):
        
    indexes = df.index[df['title_film'] == 'same as above'].tolist()
    target_ind =[(index - 1) for index in indexes]
    df['title_film'][indexes] = df['title_film'][target_ind]
    df['year_film'][indexes] = df['year_film'][target_ind]

    indexes = df.index[df['title_film'] == 'same as below'].tolist()
    target_ind =[(index + 1) for index in indexes]
    df['title_film'][indexes] = df['title_film'][target_ind]
    df['year_film'][indexes] = df['year_film'][target_ind]

    return df

In [18]:
def scrap_post_processing(df): 
    df.loc[df['film_adaptations'].isnull() & df['fiction_work'].notnull(), ['film_adaptations']] = df['fiction_work']
    df.loc[df['film_adaptations'] == df['fiction_work'], ['fiction_work']] = None

    # fill nan fiction_work values with the last non null value of fiction_work
    df['fiction_work'] = df['fiction_work'].ffill()
    # drop nan where both columns are nan
    df = df.dropna(subset=['film_adaptations'])

    df = extract_features(df)

    df = clean_same_as_above_below(df)

    return df


In [19]:
dataframes = []
for url in urls: 
    df = scrap_book_to_movie(url)
    clean_df = scrap_post_processing(df)
    dataframes.append(clean_df)

book_adaptations = pd.concat(dataframes).reset_index(drop=True)
book_adaptations.to_csv('book_adaptations.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title_book'] = df['fiction_work'].str.split('(').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['author_book'] = df_split_comma.apply(lambda x: x[-1] if len(x) > 1 else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['no_author_test'] = df_split_comma.apply(lambda x: False if 

In [21]:
#year_film = book_adaptations.copy()['year_film']
#print(book_adaptations[year_film.isnull()])

#year_book = book_adaptations.copy()['author_book']
#print(book_adaptations[year_book.isnull()])

#print(book_adaptations[book_adaptations['no_author_test'] == True].head(10))

                       title_book author_book  no_author_test year_book  \
2362  One Thousand and One Nights        None            True      None   
2363  One Thousand and One Nights        None            True      None   
2364  One Thousand and One Nights        None            True      None   
2365  One Thousand and One Nights        None            True      None   
2366  One Thousand and One Nights        None            True      None   
2367  One Thousand and One Nights        None            True      None   
2368  One Thousand and One Nights        None            True      None   
2369  One Thousand and One Nights        None            True      None   
2370  One Thousand and One Nights        None            True      None   
2371  One Thousand and One Nights        None            True      None   

                     title_film year_film  
2362        The Thief of Bagdad      1924  
2363        The Thief of Bagdad      1940  
2364             Arabian Nights      1942 