# Project 4

## Load Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, TfidfTransformer

# Download required nltk components
nltk.download('stopwords')
nltk.download('punkt')

# Set stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewmoore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matthewmoore/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 1. Load the Dataset of Twitter_Data.csv into Memory

In [2]:
# Load dataset (update the file path as needed)
df = pd.read_csv("/Users/matthewmoore/Downloads/Twitter_Data.csv", encoding = 'ISO-8859-1')

# Display dataset overview
df

Unnamed: 0,clean_text,category
0,when modi promised âminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


### Clean Dataset

In [3]:
from bs4 import BeautifulSoup

def preprocess_text(text):
    # Handle missing values
    if pd.isnull(text):
        return ""
    
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenization
    words = word_tokenize(text)
    
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Join words back into a cleaned string
    return " ".join(words)

# Apply the cleaning function to the dataset
df['processed_text'] = df['clean_text'].apply(preprocess_text)


df[['clean_text', 'processed_text']].head()


Unnamed: 0,clean_text,processed_text
0,when modi promised âminimum government maxim...,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,asking supporters prefix chowkidar names modi ...
4,answer who among these the most powerful world...,answer among powerful world leader today trump...


## 2. Convert the Column of the clean_text to a Matrix of Token Counts Using CountVectorizer and Unigrams and Bigrams

In [4]:
# CountVectorizer w/ unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1,2))

# Fit and transform 'processed text' column
matrix = vectorizer.fit_transform(df['processed_text'])

# Convert to DataFrame for visualization
df_counts = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names())

# Display shape of token count matrix
print(f"Shape of token count matrix: {matrix.shape}")

# Show first few rows of the token matrix
df_counts

Shape of token count matrix: (162980, 1168757)


Unnamed: 0,aa,aa modi,aa urgently,aaa,aaa benefits,aaa certificate,aaa modi,aaa naa,aaa pass,aaa plaaj,...,zyada kuch,zyada north,zyada shaheed,zyada time,zyadamodi,zyadamodi cut,zyonist,zyonist terrorist,zzz,zzz correcting
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162975,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
162976,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
162977,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
162978,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Summarizing the numerical features from the texts
print(f'The size of the feature matrix for the texts = {matrix.get_shape()}')
print(f'The first row of the feature matrix = {matrix[0, ]}.')
print(f'There are {matrix[0, ].count_nonzero()}/{matrix.get_shape()[1]} non-zeros')

The size of the feature matrix for the texts = (162980, 1168757)
The first row of the feature matrix =   (0, 656026)	1
  (0, 822556)	1
  (0, 646757)	1
  (0, 416928)	1
  (0, 630293)	1
  (0, 416522)	1
  (0, 339516)	1
  (0, 96829)	1
  (0, 276787)	1
  (0, 528432)	1
  (0, 863341)	1
  (0, 986514)	2
  (0, 1018753)	1
  (0, 1158576)	1
  (0, 398942)	1
  (0, 535840)	1
  (0, 140849)	1
  (0, 338756)	1
  (0, 830856)	1
  (0, 1031796)	1
  (0, 666803)	1
  (0, 822823)	1
  (0, 646797)	1
  (0, 417931)	1
  (0, 630322)	1
  (0, 416597)	1
  (0, 339537)	1
  (0, 96846)	1
  (0, 276874)	1
  (0, 529084)	1
  (0, 863344)	1
  (0, 987274)	1
  (0, 1019822)	1
  (0, 1159296)	1
  (0, 399790)	1
  (0, 536035)	1
  (0, 986611)	1
  (0, 140961)	1
  (0, 338790)	1
  (0, 830894)	1.
There are 40/1168757 non-zeros


## 3. Perform the tf-idf analysis on the column of the clean_text using CountVectorizer and TfidfTransformer

In [6]:
# TfidfTransformer
tfidf_transformer = TfidfTransformer()

# Apply TF-IDF transformation on CountVectorizer
X_tfidf = tfidf_transformer.fit_transform(matrix)

# Convert to DataFrame
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names())

# Display shape of TF-IDF matrix
print(f"Shape of TF-IDF matrix: {X_tfidf.shape}")


df_tfidf

Shape of TF-IDF matrix: (162980, 1168757)


Unnamed: 0,aa,aa modi,aa urgently,aaa,aaa benefits,aaa certificate,aaa modi,aaa naa,aaa pass,aaa plaaj,...,zyada kuch,zyada north,zyada shaheed,zyada time,zyadamodi,zyadamodi cut,zyonist,zyonist terrorist,zzz,zzz correcting
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Summarizing numerical features from texts
print(f'The size of the feature matrix for the texts = {X_tfidf.get_shape()}')
print(f'The first row of the feature matrix = {X_tfidf[0, ]}.')
print(f'There are {X_tfidf[0, ].count_nonzero()}/{X_tfidf.get_shape()[1]} non-zeros')

The size of the feature matrix for the texts = (162980, 1168757)
The first row of the feature matrix =   (0, 1159296)	0.17464386943083945
  (0, 1158576)	0.06975422384958631
  (0, 1031796)	0.1380022030425177
  (0, 1019822)	0.16313299509860826
  (0, 1018753)	0.07814839171695152
  (0, 987274)	0.2043990479301708
  (0, 986611)	0.2043990479301708
  (0, 986514)	0.1810544614830256
  (0, 863344)	0.2043990479301708
  (0, 863341)	0.1835948125486262
  (0, 830894)	0.2043990479301708
  (0, 830856)	0.15423981440884985
  (0, 822823)	0.16885965115910462
  (0, 822556)	0.09690202700852703
  (0, 666803)	0.11061759588022564
  (0, 656026)	0.019455602673464422
  (0, 646797)	0.18615474376307067
  (0, 646757)	0.11018397643297576
  (0, 630322)	0.1563995652637393
  (0, 630293)	0.1264913287102865
  (0, 536035)	0.2043990479301708
  (0, 535840)	0.11844037888049314
  (0, 529084)	0.2043990479301708
  (0, 528432)	0.09037938290329108
  (0, 417931)	0.1835948125486262
  (0, 416928)	0.07322956899487
  (0, 416597)	0.204399

## 4. Perform the tf-idf analysis on the column of the clean_text using Tfidvecotrizer

In [8]:
# TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform
X_tfidf_vec = tfidf_vectorizer.fit_transform(df['processed_text'])

# Convert to DataFrame
df_tfidf_vec = pd.DataFrame(X_tfidf_vec.toarray(), columns=tfidf_vectorizer.get_feature_names())

# Display shape
print(f"Shape of TF-IDF vectorized matrix: {X_tfidf_vec.shape}")

df_tfidf_vec

Shape of TF-IDF vectorized matrix: (162980, 101051)


Unnamed: 0,aa,aaa,aaaa,aaaaagg,aaaaand,aaaah,aaaahhhhhhhhhh,aaaand,aaaargh,aaadhar,...,zumle,zumlebaaz,zumlebaj,zunzuna,zut,zvoter,zyada,zyadamodi,zyonist,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Summarizing numerical features from texts
print(f'The size of the feature matrix for the texts = {X_tfidf_vec.get_shape()}')
print(f'The first row of the feature matrix = {X_tfidf_vec[0, ]}.')
print(f'There are {X_tfidf_vec[0, ].count_nonzero()}/{X_tfidf_vec.get_shape()[1]} non-zeros')

The size of the feature matrix for the texts = (162980, 101051)
The first row of the feature matrix =   (0, 89364)	0.2597986231072288
  (0, 72354)	0.2903670414550408
  (0, 29809)	0.2764451861218692
  (0, 13187)	0.19871832730961092
  (0, 46971)	0.22297214591545578
  (0, 34510)	0.1382803582709276
  (0, 99967)	0.13131711604960855
  (0, 88429)	0.14711971344293087
  (0, 85716)	0.3408474557918382
  (0, 75225)	0.3456298411052718
  (0, 46354)	0.17014539418853944
  (0, 24556)	0.22377725181177668
  (0, 8982)	0.2506616744607655
  (0, 29872)	0.2217595699645084
  (0, 35608)	0.21101062085313604
  (0, 55317)	0.23812861178608594
  (0, 35637)	0.1378596919191323
  (0, 56606)	0.20742890138461806
  (0, 71750)	0.1824247195920382
  (0, 57445)	0.036626507945892.
There are 20/101051 non-zeros


## 5. Perform the tf-idf analysis on the column of the clean_text using HashingVectorizer and TfidfTransformer.

In [10]:
# HashingVectorizer
hash_vectorizer = HashingVectorizer(n_features=1000)

# Transform text data
X_hash = hash_vectorizer.fit_transform(df['processed_text'])

# Convert sparse matrix to DataFrame
df_hash = pd.DataFrame(X_hash.toarray())

# Display shape
print(f"Shape of hashed matrix: {X_hash.shape}")


df_hash

Shape of hashed matrix: (162980, 1000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Summarizing numerical features from texts
print(f'The size of the feature matrix for the texts = {X_hash.get_shape()}')
print(f'The first row of the feature matrix = {X_hash[0, ]}.')
print(f'There are {X_hash[0, ].count_nonzero()}/{X_hash.get_shape()[1]} non-zeros')

The size of the feature matrix for the texts = (162980, 1000)
The first row of the feature matrix =   (0, 11)	0.2
  (0, 146)	-0.2
  (0, 209)	-0.2
  (0, 231)	0.2
  (0, 366)	0.4
  (0, 556)	0.2
  (0, 561)	-0.2
  (0, 690)	0.2
  (0, 773)	0.2
  (0, 777)	0.2
  (0, 829)	0.2
  (0, 848)	-0.2
  (0, 854)	0.2
  (0, 878)	0.2
  (0, 898)	0.4
  (0, 930)	-0.2
  (0, 960)	0.2
  (0, 973)	-0.2
  (0, 981)	-0.2.
There are 19/1000 non-zeros
