<a href="https://colab.research.google.com/github/enguyen120/BigDataProject/blob/main/Refactored_TF_IDF_Bag_of_Words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab only) so later cells can
# read from and write to paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Imports: third-party libraries first, then one-time setup.
from pathlib import Path

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the stopword list before importing the corpus reader that serves it.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Shared lemmatizer instance used by the preprocessing cell.
wordnet_lemmatizer = WordNetLemmatizer()

# WordNet data (plus the Open Multilingual WordNet package) for the lemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
# Build the TF-IDF vectorizer: it takes raw strings and produces weighted
# bag-of-words rows. min_df=10 drops every term that appears in fewer than
# 10 documents (a document-frequency threshold, not a raw word count).
tfidf_vectorizer = TfidfVectorizer(
  input='content',
  stop_words='english',
  min_df=10,
)

In [4]:
# Load the refactored article dataset from the mounted Drive into a DataFrame.
dataset_path = '/content/drive/MyDrive/Schoolwork/Big Data Final/refactor/refactored_dataset.csv'
df = pd.read_csv(dataset_path)

In [5]:
# The "corpus": a plain Python list holding the `content` value of every row.
corpus = df['content'].tolist()

In [6]:
# Preprocessing - keep only runs of ASCII letters (drops digits, punctuation and
# non-English characters); lowercase; remove stopwords; lemmatize.
# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781787285217/7/ch07lvl1sec72/identifying-and-removing-rare-words
stoplist = stopwords.words('english')
# Membership is tested once per token, so use a set (constant-time lookup)
# instead of scanning the stopword list every time. `stoplist` is kept as-is.
stopset = set(stoplist)
corpus2 = []
for article in corpus:
  # Tokenize on alphabetic runs and normalize case.
  tokens = [word.lower() for word in nltk.regexp_tokenize(article, '[a-zA-Z]+')]
  # Stopwords are removed before lemmatizing, matching the original order of steps.
  without_stops = [word for word in tokens if word not in stopset]
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in without_stops]
  # Re-join into one space-separated string, the input form the vectorizer expects.
  cleaned_text = ' '.join(lemm_text)
  corpus2.append(cleaned_text)

In [7]:
# From here on `corpus` refers to the cleaned articles; the following cells
# read `corpus`. Rebinding the name already releases the raw list, so an
# explicit `del corpus` beforehand is unnecessary.
corpus = corpus2

In [8]:
# Fit the vectorizer to the cleaned corpus - i.e. it learns which terms exist
# (its vocabulary) so that later transform() calls can map articles to columns.
tfidf_vectorizer.fit(corpus)

TfidfVectorizer(min_df=10, stop_words='english')

In [23]:
# Create the Bag of Words matrix.
# Each TF-IDF row depends only on its own document and the already-fitted
# vocabulary/IDF weights, so transforming the whole corpus in one call gives
# the same values as transforming it in slices of 10 - without growing a
# DataFrame through repeated pd.concat (quadratic copying) and without
# shadowing the builtin `max`.
sparsemat = tfidf_vectorizer.transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
  feature_names = tfidf_vectorizer.get_feature_names_out()
else:
  feature_names = tfidf_vectorizer.get_feature_names()

# Dense frame: one row per article (default 0..n-1 index, as before),
# one column per vocabulary term.
tfidf_df = pd.DataFrame(sparsemat.toarray(), columns=feature_names)



In [24]:
# Check the result: display the TF-IDF frame (one row per article in `corpus`,
# one column per term in the fitted vocabulary).
tfidf_df

Unnamed: 0,aaron,aaronkleinshow,ab,aback,abadi,abandon,abandoned,abandoning,abandonment,abbas,...,zimbabwe,zionist,zip,zombie,zone,zoo,zoom,zooming,zucker,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.015958,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.019500,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7980,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7981,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7982,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7983,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Add the publication and leftness columns to the tfidf bag of words.
# Both joins align on the row index of `tfidf_df` and `df`.
leftness_col = df['leftness']
# rename() returns a new Series, so the Series obtained from `df` is not
# mutated in place (the original assigned to `.name` on it directly).
source_col = df['publication'].rename('article_source')
# join() already returns a new frame, so no explicit copy of `tfidf_df` is needed.
new_df = tfidf_df.join(leftness_col).join(source_col)

In [27]:
# Display the TF-IDF frame with the `leftness` and `article_source` columns joined on.
new_df

Unnamed: 0,aaron,aaronkleinshow,ab,aback,abadi,abandon,abandoned,abandoning,abandonment,abbas,...,zip,zombie,zone,zoo,zoom,zooming,zucker,zuckerberg,leftness,article_source
0,0.0,0.0,0.0,0.0,0.0,0.0,0.015958,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,New York Times
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,New York Times
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,New York Times
3,0.0,0.0,0.0,0.0,0.0,0.0,0.019500,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,New York Times
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,New York Times
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7980,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Washington Post
7981,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Washington Post
7982,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Washington Post
7983,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Washington Post


In [28]:
# Class balance: how many articles carry each `leftness` value, and what
# share of the whole frame that is.
total = len(new_df)
leftness_by_label = {'RIGHT': 0, 'LEFT': 1, 'CENTER': 0.5}
label_counts = {
  label: int((new_df['leftness'] == value).sum())
  for label, value in leftness_by_label.items()
}
right = label_counts['RIGHT']
left = label_counts['LEFT']
center = label_counts['CENTER']
for label, count in label_counts.items():
  print(f'{label}:', count, "is", round(count / total * 100, 2), "%")

RIGHT: 2700 is 33.81 %
LEFT: 4453 is 55.77 %
CENTER: 832 is 10.42 %


In [29]:
# Save the TF-IDF frame to Drive as CSV (the row index is written as the first column).
# NOTE(review): this writes `tfidf_df`, not `new_df`, so the `leftness` and
# `article_source` columns joined above are not in the saved file - confirm this is intended.
# NOTE(review): the filename is spelled 'tfdif' (not 'tfidf'); left unchanged because
# other code may read this exact path - confirm before renaming.
tfidf_df.to_csv('/content/drive/MyDrive/Schoolwork/Big Data Final/refactor/refactored_tfdif.csv')