In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import os
from datetime import datetime, timedelta
import dateutil.parser
import requests
from bs4 import BeautifulSoup
import re

In [2]:
def parse_ny_post_dates(df_row):
    """
    Fill 'day', 'month' and 'year' on a NY Post row from its 'date' field.

    The date text is normalised ('.m.' -> 'M', 'ET' removed, whitespace
    stripped) and then parsed with the format "%b. %d, %Y, %I:%M %p",
    e.g. 'Jan. 5, 2023, 3:45 p.m. ET'.

    Parameters
    ----------
    df_row : mapping-like row (e.g. a pandas Series) with a 'date' entry.

    Returns
    -------
    The same row with 'day', 'month' and 'year' set to integers, or to
    np.nan when the date is missing, not a string, or not in the expected
    format.
    """
    date_format = "%b. %d, %Y, %I:%M %p"
    try:
        # Normalisation lives inside the try so that a non-string date
        # (e.g. the NaN float pandas uses for an empty CSV cell) is treated
        # as "unparseable" instead of raising AttributeError.
        date_string = df_row['date'].replace('.m.', 'M').replace('ET', '').strip()
        parsed_date = datetime.strptime(date_string, date_format)
    except (ValueError, TypeError, AttributeError):
        df_row['day'] = np.nan
        df_row['month'] = np.nan
        df_row['year'] = np.nan
        return df_row

    df_row['day'] = parsed_date.day
    df_row['month'] = parsed_date.month
    df_row['year'] = parsed_date.year
    return df_row

In [3]:
def parse_guardian_dates(df_row):
    """
    Fill 'day', 'month' and 'year' on a Guardian row from its 'date' field.

    The first and last space-separated tokens of the date text are dropped,
    '.' is replaced by ':' in what remains, and the result is parsed with
    the format "%d %b %Y %H:%M" (e.g. 'Mon 5 Jun 2023 14.30 BST' becomes
    '5 Jun 2023 14:30').

    Parameters
    ----------
    df_row : mapping-like row (e.g. a pandas Series) with a 'date' entry.

    Returns
    -------
    The same row with 'day', 'month' and 'year' set to integers, or to
    np.nan when the date is missing, not a string, or not in the expected
    format.
    """
    date_format = "%d %b %Y %H:%M"
    try:
        # Kept inside the try: a non-string date (NaN from an empty CSV
        # cell) has no .split and must fall through to the NaN branch.
        inner_tokens = df_row['date'].split(' ')[1:-1]
        date_string = ' '.join(inner_tokens).replace('.', ':')
        parsed_date = datetime.strptime(date_string, date_format)
    except (ValueError, TypeError, AttributeError):
        df_row['day'] = np.nan
        df_row['month'] = np.nan
        df_row['year'] = np.nan
        return df_row

    df_row['day'] = parsed_date.day
    df_row['month'] = parsed_date.month
    df_row['year'] = parsed_date.year
    return df_row

In [4]:
def parse_fox_news_dates(df_row):
    """
    Fill 'day', 'month' and 'year' on a Fox News row from its 'date' field.

    The 'EST'/'EDT' zone tags are removed and 'am'/'pm' upper-cased, then
    the text is parsed first as "%B %d, %Y %I:%M%p" (date with time) and,
    if that fails, as "%B %d, %Y" (date only).

    Parameters
    ----------
    df_row : mapping-like row (e.g. a pandas Series) with a 'date' entry.

    Returns
    -------
    The same row with 'day', 'month' and 'year' set to integers, or to
    np.nan when the date is missing, not a string, or matches neither
    format.
    """
    date_formats = (
        "%B %d, %Y %I:%M%p",  # with time
        "%B %d, %Y",          # date only
    )

    parsed_date = None
    raw_date = df_row['date']
    # Non-string dates (NaN from an empty CSV cell) cannot be normalised;
    # leave parsed_date as None so the NaN branch below is taken.
    if isinstance(raw_date, str):
        date_string = (
            raw_date
            .replace('EST', '')
            .replace('EDT', '')  # daylight-time stamps, as the CNN parser handles
            .replace('pm', 'PM')
            .replace('am', 'AM')
            .strip()
        )
        for date_format in date_formats:
            try:
                parsed_date = datetime.strptime(date_string, date_format)
                break
            except ValueError:
                continue

    if parsed_date is None:
        df_row['day'] = np.nan
        df_row['month'] = np.nan
        df_row['year'] = np.nan
        return df_row

    df_row['day'] = parsed_date.day
    df_row['month'] = parsed_date.month
    df_row['year'] = parsed_date.year

    return df_row

In [5]:
def parse_atlantic_dates(df_row):
    """
    Fill 'day', 'month' and 'year' on an Atlantic row from its 'date' field.

    The date text has any trailing 'Z' characters stripped and is then
    parsed with datetime.fromisoformat.

    Parameters
    ----------
    df_row : mapping-like row (e.g. a pandas Series) with a 'date' entry.

    Returns
    -------
    The same row with 'day', 'month' and 'year' set to integers, or to
    np.nan when the date is missing, not a string, or not ISO formatted.
    """
    date_string = df_row['date']
    try:
        parsed_date = datetime.fromisoformat(date_string.rstrip('Z'))
    except (ValueError, TypeError, AttributeError):
        # AttributeError covers non-string dates such as the NaN float
        # pandas produces for an empty CSV cell (no .rstrip method).
        df_row['day'] = np.nan
        df_row['month'] = np.nan
        df_row['year'] = np.nan
        return df_row

    df_row['day'] = parsed_date.day
    df_row['month'] = parsed_date.month
    df_row['year'] = parsed_date.year

    return df_row

In [6]:
def parse_cnn_dates(df_row):
    """
    Fill 'day', 'month' and 'year' on a CNN row from its 'date' field.

    The ' EST' / ' EDT' zone tags are removed, whitespace is stripped, and
    the text is parsed with the format "%I:%M %p, %a %B %d, %Y",
    e.g. '3:45 PM EDT, Mon June 5, 2023'.

    Parameters
    ----------
    df_row : mapping-like row (e.g. a pandas Series) with a 'date' entry.

    Returns
    -------
    The same row with 'day', 'month' and 'year' set to integers, or to
    np.nan when the date is missing, not a string, or not in the expected
    format.
    """
    date_format = "%I:%M %p, %a %B %d, %Y"

    try:
        # Normalisation is inside the try so a non-string date (NaN from an
        # empty CSV cell) ends up in the NaN branch rather than raising.
        date_string = df_row['date'].replace(' EST', '').replace(' EDT', '').strip()
        parsed_date = datetime.strptime(date_string, date_format)
    except (ValueError, TypeError, AttributeError):
        df_row['day'] = np.nan
        df_row['month'] = np.nan
        df_row['year'] = np.nan
        return df_row

    df_row['day'] = parsed_date.day
    df_row['month'] = parsed_date.month
    df_row['year'] = parsed_date.year

    return df_row

In [7]:
def parse_business_insider_dates(df_row):
    """
    Fill 'day', 'month' and 'year' on a Business Insider row from 'date'.

    The date text has any trailing 'Z' characters stripped and is then
    parsed with datetime.fromisoformat.

    Parameters
    ----------
    df_row : mapping-like row (e.g. a pandas Series) with a 'date' entry.

    Returns
    -------
    The same row with 'day', 'month' and 'year' set to integers, or to
    np.nan when the date is missing, not a string, or not ISO formatted.
    """
    date_string = df_row['date']
    try:
        parsed_date = datetime.fromisoformat(date_string.rstrip('Z'))
    except (ValueError, TypeError, AttributeError):
        # np.nan (not the string 'Unknown') is the missing-value marker used
        # by every other source parser in this notebook; a string here would
        # turn the day/month/year columns into mixed object columns.
        # AttributeError covers non-string dates such as NaN.
        df_row['day'] = np.nan
        df_row['month'] = np.nan
        df_row['year'] = np.nan
        return df_row

    df_row['day'] = parsed_date.day
    df_row['month'] = parsed_date.month
    df_row['year'] = parsed_date.year

    return df_row

In [8]:
def parse_washington_post_dates(df_row):
    """
    Fill 'day', 'month' and 'year' on a Washington Post row from 'date'.

    The date text is normalised ('at' removed, '.m.' -> 'M', 'EST'/'EDT'
    removed, whitespace stripped) and parsed with the format
    "%B %d, %Y %I:%M %p", e.g. 'June 5, 2023 at 3:45 p.m. EDT'.

    Parameters
    ----------
    df_row : mapping-like row (e.g. a pandas Series) with a 'date' entry.

    Returns
    -------
    The same row with 'day', 'month' and 'year' set to integers, or to
    np.nan when the date is missing, not a string, or not in the expected
    format.
    """
    date_string = df_row['date']
    date_format = "%B %d, %Y %I:%M %p"
    try:
        date_string = (
            date_string
            .replace('at', '')
            .replace('.m.', 'M')
            .replace('EST', '')
            .replace('EDT', '')  # daylight-time stamps would otherwise fail to parse
            .strip()
        )
        parsed_date = datetime.strptime(date_string, date_format)
    except (ValueError, TypeError, AttributeError):
        # Only the failures a bad/missing date can cause are handled here;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        df_row['day'] = np.nan
        df_row['month'] = np.nan
        df_row['year'] = np.nan
        return df_row

    df_row['day'] = parsed_date.day
    df_row['month'] = parsed_date.month
    df_row['year'] = parsed_date.year
    return df_row

In [9]:
def clean_data(df_row):
    """
    Remove recurring boilerplate phrases from a row's 'content' text.

    Parameters
    ----------
    df_row : mapping-like row (e.g. a pandas Series) with a 'content' entry.

    Returns
    -------
    The same row. When 'content' is a string, every occurrence of the
    boilerplate phrases below is removed from it; when 'content' is absent
    or not a string the row is returned unchanged.
    """
    boilerplate_phrases = (
        "Δ Thanks for contacting us. We've received your submission.",
        "Fox News Flash top headlines are here. Check out what's clicking on Foxnews.com",
    )

    try:
        content = df_row['content']
    except KeyError:
        # No content field at all: nothing to clean.
        return df_row

    # Missing values (NaN/None) and other non-text content are left as-is.
    if not isinstance(content, str):
        return df_row

    for phrase in boilerplate_phrases:
        content = content.replace(phrase, '')
    df_row['content'] = content
    return df_row

In [10]:
def separate_joined_words(text):
    """
    Insert a space wherever a lowercase letter is immediately followed by an
    uppercase letter, e.g. 'helloWorld' -> 'hello World'.

    Only the lowercase-to-uppercase boundary is split; a run of capitals
    followed by lowercase letters (such as 'JUSTin') is left untouched.

    Parameters
    ----------
    text : the string to process.

    Returns
    -------
    The processed string, or `text` itself unchanged when it is not a
    string (e.g. NaN or None).
    """
    if not isinstance(text, str):
        return text
    # Zero-width match between an ASCII lowercase and an ASCII uppercase letter.
    return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)

In [11]:
def filter_scraped_data(df1):
    """
    Takes the unfiltered scraped dataframe as input and returns a filtered copy.

    Steps, in order:
      1. Drop the 'summary' column (if present).
      2. Add 'day', 'month' and 'year' columns, filled per source by the
         matching date parser (selected by the 'name' column); rows from
         sources without a parser keep NaN.
      3. Drop rows whose 'content' is missing and reset the index.
      4. Remove boilerplate phrases from 'content' via clean_data.
      5. Split lowercase/uppercase-joined words in 'content' and 'title'.

    Parameters
    ----------
    df1 : pandas.DataFrame with at least the columns 'name', 'date',
        'content' and 'title'. It is not modified.

    Returns
    -------
    pandas.DataFrame : the filtered frame.
    """
    date_info_dict = {
            'NY Post': parse_ny_post_dates,
            'Atlantic': parse_atlantic_dates,
            'CNN': parse_cnn_dates,
            'Business Insider': parse_business_insider_dates,
            'Washington Post': parse_washington_post_dates,
            'Fox News': parse_fox_news_dates,
            'Guardian': parse_guardian_dates
        }

    # Work on a private copy: the caller's frame is never left half-processed,
    # and calling this twice on the same frame no longer fails on the
    # already-removed 'summary' column.
    df1 = df1.drop(columns=['summary'], errors='ignore').copy()

    df1['day'] = np.nan
    df1['month'] = np.nan
    df1['year'] = np.nan

    for name, parse_dates in date_info_dict.items():
        source_rows = df1['name'] == name
        if not source_rows.any():
            # Nothing scraped from this source; skip the empty apply.
            continue
        df1[source_rows] = df1[source_rows].apply(parse_dates, axis=1)

    df1 = df1.dropna(subset=['content']).reset_index(drop=True)

    df1 = df1.apply(clean_data, axis=1)

    df1['content'] = df1['content'].apply(separate_joined_words)
    df1['title'] = df1['title'].apply(separate_joined_words)

    return df1

In [12]:
# Load the scraped articles (first CSV column becomes the index) and run them
# through the filtering pipeline; the returned frame is this cell's output.
filter_scraped_data(pd.read_csv('new_scraping_data.csv',index_col=0))

NameError: name 'input_filename' is not defined