<a href="https://colab.research.google.com/github/enguyen120/BigDataProject/blob/main/Refactored_Code_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get ready

In [1]:
# Download the data first: articles1.csv, articles2.csv, articles3.csv, and media-bias-fc-scrape.csv

In [3]:
# Mount GDrive
# Mounts Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# import
import os
import pandas as pd
import re
import numpy as np

# Unify article data

In [6]:
# Read the three article CSV exports and stack them into one pandas
# DataFrame, "data_df", renumbering the rows with a fresh index.
# https://www.geeksforgeeks.org/how-to-merge-multiple-csv-files-into-a-single-pandas-dataframe/
filenames = [
    '/content/drive/MyDrive/Schoolwork/Big Data Final/articles1.csv',
    '/content/drive/MyDrive/Schoolwork/Big Data Final/articles2.csv',
    '/content/drive/MyDrive/Schoolwork/Big Data Final/articles3.csv',
]
data_df = pd.concat(
    (pd.read_csv(csv_path) for csv_path in filenames),
    ignore_index=True,
)

In [7]:
data_df.shape

(142570, 10)

In [8]:
# Keep only the articles published in November 2016. Both conditions are
# combined into a single boolean mask and applied in one step.
is_nov_2016 = (data_df.month == 11.0) & (data_df.year == 2016.0)
data_df = data_df[is_nov_2016]

In [9]:
data_df

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
2671,2671,20256,"Dilma Rousseff, Facing Impeachment in Brazil, ...",New York Times,Andrew Jacobs,2016-11-15,2016.0,11.0,,"BRASÍLIA — They were idealists, united in t..."
2679,2679,20266,Los Angeles Police Official Resigns Over Racis...,New York Times,Liam Stack,2016-11-17,2016.0,11.0,,A official in the Los Angeles County Sheriff...
2689,2689,20276,Man Wrongfully Convicted of Murder Awaits His ...,New York Times,Alan Feuer,2016-11-14,2016.0,11.0,,"A couple of years ago, Paul Gatling, a retired..."
2691,2691,20279,Ted Cruz Suspends His Campaign for President -...,New York Times,Matt Flegenheimer,2016-11-16,2016.0,11.0,,"INDIANAPOLIS — Less than a month ago, Senat..."
2710,2710,20300,Republican Party Unravels Over Donald Trump’s ...,New York Times,Patrick Healy and Jonathan Martin,2016-11-15,2016.0,11.0,,By seizing the Republican presidential nominat...
...,...,...,...,...,...,...,...,...,...,...
142046,145509,217398,We weren’t exclusive. But when I saw him with ...,Washington Post,Danielle Sepulveres,2016-11-30,2016.0,11.0,https://web.archive.org/web/20161204005809/htt...,“Are you really heartbroken or is your eg...
142047,145510,217399,The rules for setting your holiday table (and ...,Washington Post,Jura Koncius,2016-11-29,2016.0,11.0,https://web.archive.org/web/20161204005809/htt...,"Setting the table is often a hurried, chore..."
142048,145511,217400,Their 5-year-old is aggressive and quick to bl...,Washington Post,Meghan Leahy,2016-11-30,2016.0,11.0,https://web.archive.org/web/20161204005809/htt...,"Q: We have four boys, ages 6, 5 and twin ...."
142055,145518,217413,Trump is surrounding himself with generals. Th...,Washington Post,Phillip Carter,2016-11-30,2016.0,11.0,https://web.archive.org/web/20161205004132/htt...,Phillip Carter is a former Army officer an...


In [10]:
data_df.shape

(7985, 10)

In [11]:
data_df.columns

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [12]:
data_df['id'].is_unique

True

In [13]:
# Keep only the two columns needed downstream (the source name and the
# article text) as an independent copy, so later edits do not touch data_df.
relevant_columns = ['publication', 'content']
nov_df = data_df.loc[:, relevant_columns].copy()

In [14]:
nov_df.head()

Unnamed: 0,publication,content
2671,New York Times,"BRASÍLIA — They were idealists, united in t..."
2679,New York Times,A official in the Los Angeles County Sheriff...
2689,New York Times,"A couple of years ago, Paul Gatling, a retired..."
2691,New York Times,"INDIANAPOLIS — Less than a month ago, Senat..."
2710,New York Times,By seizing the Republican presidential nominat...


In [15]:
nov_df.shape

(7985, 2)

# Unify bias data

In [16]:
# load media-bias-fc-scrape
# Reads the scraped CSV into mbfc_df; its columns include site_name, url,
# bias_png and factual_reporting.
mbfc_df = pd.read_csv("/content/drive/MyDrive/Schoolwork/Big Data Final/media-bias-fc-scrape.csv")

In [17]:
mbfc_df.head()

Unnamed: 0,site_name,url,bias_png,factual_reporting
0,Act.TV,http://act.tv,left4,HIGH
1,Addicting Info,http://addictinginfo.com,left5,MIXED
2,Advocate,http://www.advocate.com/,left8,HIGH
3,Akkadian Times,http://akkadiantimes.com,left8,HIGH
4,Alliance for Justice (AFJ),https://www.afj.org/,left8,HIGH


In [18]:
# Remove the digits from the "bias_png" column (e.g. "left4" -> "left") so
# only the bias category name remains.
# https://stackoverflow.com/questions/41719259/how-to-remove-numbers-from-string-terms-in-a-pandas-dataframe
# The pattern is a raw string so "\d" is not read as an (invalid) Python
# string escape, and regex=True is passed explicitly: without it, newer
# pandas versions treat the pattern as literal text and leave the digits in.
mbfc_df["bias_png"] = mbfc_df["bias_png"].str.replace(r'\d+', '', regex=True)

  mbfc_df["bias_png"] = mbfc_df["bias_png"].str.replace('\d+', '')


In [19]:
# check values in "bias_png" column
mbfc_df['bias_png'].unique()

array(['left', 'extremeleft', 'leftcenter', 'leastbiased', 'rightcenter',
       'right', 'extremeright'], dtype=object)

In [20]:
# create new column, "bias", with value based on bias_png value
def sorter(x):
  """Convert a bias category name into a numeric score.

  "left", "extremeleft" and "leftcenter" give 1; "leastbiased" gives 0.5;
  "rightcenter", "right" and "extremeright" give 0. Any other value falls
  through and gives None.
  """
  score_groups = (
    (("left", "extremeleft", "leftcenter"), 1),
    (("leastbiased",), 0.5),
    (("rightcenter", "right", "extremeright"), 0),
  )
  for labels, score in score_groups:
    if x in labels:
      return score
  return None

# Score every row: run sorter on each "bias_png" value and store the result
# in the new "bias" column.
mbfc_df['bias'] = mbfc_df['bias_png'].apply(sorter)

In [21]:
mbfc_df.head()

Unnamed: 0,site_name,url,bias_png,factual_reporting,bias
0,Act.TV,http://act.tv,left,HIGH,1.0
1,Addicting Info,http://addictinginfo.com,left,MIXED,1.0
2,Advocate,http://www.advocate.com/,left,HIGH,1.0
3,Akkadian Times,http://akkadiantimes.com,left,HIGH,1.0
4,Alliance for Justice (AFJ),https://www.afj.org/,left,HIGH,1.0


In [22]:
# Keep only the source name and its numeric bias score, as an independent
# copy so the renames below do not modify mbfc_df.
bias_columns = ['site_name', 'bias']
bias_df = mbfc_df.loc[:, bias_columns].copy()

In [23]:
bias_df.head()

Unnamed: 0,site_name,bias
0,Act.TV,1.0
1,Addicting Info,1.0
2,Advocate,1.0
3,Akkadian Times,1.0
4,Alliance for Justice (AFJ),1.0


# Clean bias data

In [24]:
# Collect the distinct source names of each dataset and compare their counts:
# the bias dataset lists far more sources than the article dataset.
bias_sources = pd.unique(bias_df['site_name'])
article_sources = pd.unique(nov_df['publication'])
n_bias_sources = len(bias_sources)
n_article_sources = len(article_sources)
print(n_bias_sources, "sources in Bias dataset;", n_article_sources, "sources in Article dataset.")

1598 sources in Bias dataset; 15 sources in Article dataset.


In [25]:
# List every publication in the article dataset whose name has no exact
# match among the bias-dataset source names. Expected:
# Breitbart, Atlantic, Buzzfeed News, Guardian, NPR
unmatched_sources = [pub for pub in article_sources if pub not in bias_sources]
for pub in unmatched_sources:
  print(pub)

Breitbart
Atlantic
Buzzfeed News
Guardian
NPR


In [26]:
# Now, locate which source in bias_sources corresponds to each inexactly named one in the article dataset.
# Then change that source's name in bias_df so it matches the article dataset's name exactly.

In [27]:
# Candidate bias-dataset names that contain "Atlantic".
atlantic_pattern = re.compile('Atlantic')
temp = list(filter(atlantic_pattern.search, bias_sources))
print(temp)

['The Atlantic', 'Atlantic Media', 'Atlantic Council']


In [28]:
# Show the bias-dataset row(s) named 'The Atlantic', including the index label.
print(bias_df.loc[bias_df['site_name'] == 'The Atlantic'])

        site_name  bias
333  The Atlantic   1.0


In [29]:
# Rename the bias-dataset entry "The Atlantic" to "Atlantic" so it matches the
# publication name used in the article dataset. The row is located by name
# instead of a hard-coded position, so the wrong row cannot be renamed if the
# bias CSV changes order; the already-renamed value is accepted too, which
# keeps the cell safe to re-run. The name is printed before and after.
atlantic_rows = bias_df['site_name'].isin(['The Atlantic', 'Atlantic'])
assert atlantic_rows.sum() == 1, "expected exactly one Atlantic row in bias_df"
print(bias_df.loc[atlantic_rows, 'site_name'].iloc[0])
bias_df.loc[atlantic_rows, 'site_name'] = 'Atlantic'
print(bias_df.loc[atlantic_rows, 'site_name'].iloc[0])

The Atlantic
Atlantic


In [30]:
# Candidate bias-dataset names that contain "Buzzfeed".
buzzfeed_pattern = re.compile('Buzzfeed')
temp = list(filter(buzzfeed_pattern.search, bias_sources))
print(temp)

['Buzzfeed']


In [31]:
# Show the bias-dataset row(s) named 'Buzzfeed', including the index label.
print(bias_df.loc[bias_df['site_name'] == 'Buzzfeed'])

    site_name  bias
359  Buzzfeed   1.0


In [32]:
# Rename the bias-dataset entry "Buzzfeed" to "Buzzfeed News" so it matches
# the publication name used in the article dataset. The row is located by
# name instead of a hard-coded position, so the wrong row cannot be renamed
# if the bias CSV changes order; the already-renamed value is accepted too,
# which keeps the cell safe to re-run. The renamed row is printed afterwards.
buzzfeed_rows = bias_df['site_name'].isin(['Buzzfeed', 'Buzzfeed News'])
assert buzzfeed_rows.sum() == 1, "expected exactly one Buzzfeed row in bias_df"
bias_df.loc[buzzfeed_rows, 'site_name'] = 'Buzzfeed News'
print(bias_df.loc[buzzfeed_rows].iloc[0])

site_name    Buzzfeed News
bias                   1.0
Name: 359, dtype: object


In [33]:
# Candidate bias-dataset names that contain "Guardian".
guardian_pattern = re.compile('Guardian')
temp = list(filter(guardian_pattern.search, bias_sources))
print(temp)

['Democracy Guardian', 'Guardian Liberty Voice', 'Guardians of Democracy', 'The Guardian']


In [34]:
# Show the bias-dataset row(s) named 'The Guardian', including the index label.
print(bias_df.loc[bias_df['site_name'] == 'The Guardian'])

        site_name  bias
668  The Guardian   1.0


In [35]:
# Rename the bias-dataset entry "The Guardian" to "Guardian" so it matches the
# publication name used in the article dataset. The row is located by name
# instead of a hard-coded position, so the wrong row cannot be renamed if the
# bias CSV changes order; the already-renamed value is accepted too, which
# keeps the cell safe to re-run. The name is printed before and after.
guardian_rows = bias_df['site_name'].isin(['The Guardian', 'Guardian'])
assert guardian_rows.sum() == 1, "expected exactly one Guardian row in bias_df"
print(bias_df.loc[guardian_rows, 'site_name'].iloc[0])
bias_df.loc[guardian_rows, 'site_name'] = 'Guardian'
print(bias_df.loc[guardian_rows, 'site_name'].iloc[0])

The Guardian
Guardian


In [36]:
# Candidate bias-dataset names that contain "NPR".
npr_pattern = re.compile('NPR')
temp = list(filter(npr_pattern.search, bias_sources))
print(temp)

['KUOW NPR', 'NPR (National Public Radio)']


In [37]:
# Show the bias-dataset row(s) named 'NPR (National Public Radio)', including the index label.
print(bias_df.loc[bias_df['site_name'] == 'NPR (National Public Radio)'])

                       site_name  bias
564  NPR (National Public Radio)   1.0


In [38]:
# Rename the bias-dataset entry "NPR (National Public Radio)" to "NPR" so it
# matches the publication name used in the article dataset. The row is located
# by name instead of a hard-coded position, so the wrong row cannot be renamed
# if the bias CSV changes order; the already-renamed value is accepted too,
# which keeps the cell safe to re-run. The name is printed before and after.
npr_rows = bias_df['site_name'].isin(['NPR (National Public Radio)', 'NPR'])
assert npr_rows.sum() == 1, "expected exactly one NPR row in bias_df"
print(bias_df.loc[npr_rows, 'site_name'].iloc[0])
bias_df.loc[npr_rows, 'site_name'] = 'NPR'
print(bias_df.loc[npr_rows, 'site_name'].iloc[0])

NPR (National Public Radio)
NPR


In [39]:
# Candidate bias-dataset names that contain "Breitbart".
breitbart_pattern = re.compile('Breitbart')
temp = list(filter(breitbart_pattern.search, bias_sources))
print(temp)

['Breitbart Unmasked']


In [40]:
# Breitbart is not included in the bias dataset.
# We used background knowledge to determine that Breitbart is a right-leaning source,
# and added it manually (bias 0.0, the same score sorter gives right-leaning categories).
# The append is guarded so that re-running this cell does not add a second
# Breitbart row.
if not (bias_df['site_name'] == 'Breitbart').any():
  bias_df.loc[len(bias_df.index)] = ['Breitbart', 0.0]

In [41]:
# check that it was added correctly
bias_df.loc[len(bias_df.index) - 1]

site_name    Breitbart
bias               0.0
Name: 1604, dtype: object

In [42]:
bias_df

Unnamed: 0,site_name,bias
0,Act.TV,1.0
1,Addicting Info,1.0
2,Advocate,1.0
3,Akkadian Times,1.0
4,Alliance for Justice (AFJ),1.0
...,...,...
1600,World Tribune,0.0
1601,YellowHammer News,0.0
1602,Young America’s Foundation (YAF),0.0
1603,Young Conservatives,0.0


# Connect bias data to article data

In [43]:
# Build a {site_name: bias} lookup from the bias data, then translate every
# article's publication into its bias score. The result is a Series aligned
# with data_df's index; a publication missing from the lookup maps to NaN.
# Stack Overflow was used for this; link has been lost
bias_dict = {
    site: bias
    for site, bias in zip(bias_df['site_name'], bias_df['bias'])
}
leftness_df = data_df['publication'].map(bias_dict)

In [44]:
leftness_df

2671      1.0
2679      1.0
2689      1.0
2691      1.0
2710      1.0
         ... 
142046    1.0
142047    1.0
142048    1.0
142055    1.0
142058    1.0
Name: publication, Length: 7985, dtype: float64

In [45]:
# Rename the column "publication" to "leftness" in leftness_df
# leftness_df is a Series produced by .map on the 'publication' column, so it
# still carries the name "publication". The name is printed before and after
# the change to confirm it.
print(leftness_df.name)
leftness_df.name = "leftness"
print(leftness_df.name)

publication
leftness


In [46]:
# Add 'leftness' column to new df
# The Series is joined onto nov_df by index label; it becomes a column named
# after the Series' own name.
new_df = nov_df.join(leftness_df)

In [47]:
new_df.head()

Unnamed: 0,publication,content,leftness
2671,New York Times,"BRASÍLIA — They were idealists, united in t...",1.0
2679,New York Times,A official in the Los Angeles County Sheriff...,1.0
2689,New York Times,"A couple of years ago, Paul Gatling, a retired...",1.0
2691,New York Times,"INDIANAPOLIS — Less than a month ago, Senat...",1.0
2710,New York Times,By seizing the Republican presidential nominat...,1.0


In [53]:
new_df.shape

(7985, 3)

In [49]:
# Tally the articles per leaning (leftness 0 = right, 1 = left, 0.5 = center)
# and print each tally together with its percentage of all rows in new_df.
total = len(new_df)
leftness_values = new_df['leftness']
right = int((leftness_values == 0).sum())
left = int((leftness_values == 1).sum())
center = int((leftness_values == 0.5).sum())
for label, count in (('RIGHT:', right), ('LEFT:', left), ('CENTER:', center)):
  print(label, count, "is", round(count / total * 100, 2), "%")

RIGHT: 2700 is 33.81 %
LEFT: 4453 is 55.77 %
CENTER: 832 is 10.42 %


In [54]:
# Drop every row that has a missing value in any column, returning a new
# frame and leaving new_df itself unchanged.
# https://stackoverflow.com/questions/45745085/python-pandas-how-to-remove-nan-and-inf-values
clean_df = new_df.dropna()

In [55]:
clean_df.shape

(7985, 3)

In [56]:
# Save the cleaned dataset (publication, content, leftness) to Drive.
# NOTE(review): no index argument is passed, so the row index is presumably
# written as an unnamed first column (it would read back as "Unnamed: 0");
# confirm with downstream readers before switching to index=False.
clean_df.to_csv('/content/drive/MyDrive/Schoolwork/Big Data Final/refactor/refactored_dataset.csv')