In [1]:
# packages
import pandas as pd

import numpy as np
np.random.seed(2021)

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
#nltk.download('wordnet') #download if not present already

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [2]:
# cleaning functions

def cleaning(text):
    """Strip tweet noise (mentions, emails, links, punctuation, digits) from one string.

    Partially adapted from juejue's and jamie's notebooks.

    The steps run in a deliberate order: tokens that are recognised *by* their
    punctuation (``@`` for mentions/emails, ``://`` and ``www.`` for links) are
    removed before punctuation is blanked out. Doing it the other way round
    destroys the markers those patterns rely on, so fragments such as
    ``https t co abc`` or a bare user name would be left in the text.

    NOTE(review): ``re`` is not imported explicitly in the imports cell; it
    appears to reach this namespace through ``from nltk.stem.porter import *``.
    Consider adding an explicit ``import re`` there.

    Parameters
    ----------
    text : str
        A single tweet. Line breaks are handled when they appear as the
        literal two-character sequence backslash + ``n``.

    Returns
    -------
    str
        The cleaned text, containing only ASCII letters, spaces and tabs.
        Runs of spaces are not collapsed.
    """
    # literal backslash-n sequences become a word separator
    text = text.replace("\\n", " ")

    # 1) whole tokens identified by punctuation -- must run before step 2
    text = re.sub(r'\S*@\S*\s?', '', text)  # emails and @mentions
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)  # web links
    text = re.sub(r'\w+://\S+', ' ', text)  # links with any other scheme

    # 2) punctuation becomes a space so "hello,world" stays two words
    text = re.sub(r'[^\w\s]', ' ', text)

    # 3) drop everything that is not an ASCII letter/digit, space or tab
    #    (underscores, non-ASCII characters, real line breaks)
    text = re.sub(r'[^0-9A-Za-z \t]', '', text)

    # 4) remove numbers
    text = re.sub(r'\d+', '', text)

    return text


def clean_stem(tweets, rawtweets = True): # cleaning and stemming
    """Clean every tweet, drop stop words and stem the remaining words.

    Parameters
    ----------
    tweets : pandas.Series
        Series of tweet strings (the ``.str`` accessor, ``.apply`` and
        ``.to_list()`` are used on it).
    rawtweets : bool, default True
        When truthy, the tweets are treated as raw text pulled straight from
        Twitter: the first two characters (``b`` plus the opening quote) and
        the last character (the closing quote) are stripped from each one.

    Returns
    -------
    list of str
        One space-joined string of stemmed, non-stop-word tokens per tweet,
        in the same order as ``tweets``.
    """
    if rawtweets:
        # one slice removes both the leading "b'" and the trailing quote
        tweets = tweets.str[2:-1]

    # removing punctuation, \\n, links, etc.
    cleaned = tweets.apply(cleaning).to_list()

    stemmer = SnowballStemmer("english")
    process_tweets = []
    for sentence in cleaned:
        # The stop-word list is lower-case, so compare the lower-cased word.
        # Without this, a capitalised stop word ("The") passes the filter and
        # the stemmer, which lower-cases its output, returns it as "the".
        kept = [
            stemmer.stem(word)
            for word in sentence.split()
            if word.lower() not in STOPWORDS
        ]
        process_tweets.append(" ".join(kept))

    # return list of cleaned tweets
    return process_tweets
   

In [3]:
# Shared bag-of-words vectorizer; it is fitted on the cleaned tweets in a later cell.
vectorizer = CountVectorizer(min_df=2) # min_df=2: a token must reach a document frequency of 2 to be kept

In [4]:
# Load the tweets and filter them in one chain, so re-running the cell always
# starts again from the file on disk.
tweets = (
    pd.read_csv("../Data/vaccine_college_500.csv")
    # a missing text would make the boolean filter below raise and would
    # also break the string cleaning, so drop such rows up front
    .dropna(subset=["text"])
    .drop_duplicates(subset=["text"], keep="first")  # remove duplicate tweets
    # remove tweets that have "Pin Code"; regex=False matches it literally and
    # na=False keeps the mask strictly boolean so `~` is always valid
    .loc[lambda df: ~df["text"].str.contains("Pin Code", regex=False, na=False)]
)

# cleaning
text = clean_stem(tweets["text"], rawtweets=True)

In [5]:
# Fit the vectorizer on the cleaned tweets and convert the result to a dense array.
# NOTE(review): presumably one row per tweet and one column per kept token -- confirm.
freq_matrix = vectorizer.fit_transform(text).toarray()