In [1]:
import os
import re
import pickle
import pandas as pd
import spacy
import nltk
import contractions
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS as spacy_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the medium English spaCy pipeline with the "ner" and "parser"
# components disabled.
nlp = spacy.load(
    "en_core_web_md",
    disable=["ner", "parser"],
)

In [None]:
# Fetch every NLTK resource this notebook relies on, not only the tokenizer:
#   - 'punkt'      : tokenizer data used by word_tokenize
#   - 'punkt_tab'  : the resource name newer NLTK releases look up for the
#                    same tokenizer
#   - 'stopwords'  : corpus read by stopwords.words('english') below
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

In [4]:
# Single lookup set holding both NLTK's English stop words and spaCy's.
combined_stopwords = {*stopwords.words('english'), *spacy_stopwords}

In [5]:
# Read the raw tweet CSV. header=None tells pandas not to treat the first row
# as a header, so the column names are supplied explicitly.
data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    header=None,
    names=["sentiment", "id", "date", "flag", "username", "text"],
)

In [None]:
# Peek at the first five rows of the freshly loaded frame.
data.head(n=5)

In [7]:
# Keep only the label and the tweet text.
# The columns are dropped by name instead of by the positional slice
# data.columns[1:5]: once the frame is down to two columns, re-running a
# positional drop would remove "text" itself. errors="ignore" makes a repeat
# run a no-op.
data = data.drop(columns=["id", "date", "flag", "username"], errors="ignore")

In [None]:
# Show the first five rows after the column drop.
data.head(n=5)

In [9]:
def normalize_tweet(tweet):
    """Return ``tweet`` converted to lower case."""
    lowered = tweet.lower()
    return lowered

In [None]:
# Lower-case every tweet through a thread pool sized to the CPU count and
# write the results back into the "text" column.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(normalize_tweet, data["text"])]

In [19]:
def fix_contractions(tweet):
    """Return ``tweet`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(tweet)
    return expanded

In [20]:
# Run fix_contractions over every tweet and store the results back in "text".
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(fix_contractions, data["text"])]

In [21]:
def remove_noisy_tokens(tweet):
    """Replace mentions, hashtags, URLs, non-word characters and digit runs with spaces.

    The work is done in two passes. With a single alternation
    (``@...|#...|http\\S+|\\W+|\\d+``) the regex engine, scanning left to
    right, reaches the whitespace *before* a mention or hashtag first and
    ``\\W+`` greedily swallows that whitespace together with the ``@``/``#``.
    The entity alternatives then never match, and the user name or hashtag
    text is left in the tweet. Removing whole entities first avoids that.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet with each removed span replaced by a single space.
        Whitespace is not collapsed or stripped here.
    """
    # Pass 1: drop whole entities (@mention, #hashtag, http... links).
    without_entities = re.sub(pattern=r'@[a-zA-Z0-9_]+|#[a-zA-Z0-9_]+|http\S+',
                              string=tweet, repl=" ")
    # Pass 2: drop remaining punctuation/whitespace runs and digit runs.
    return re.sub(pattern=r'\W+|\d+', string=without_entities, repl=" ")

In [15]:
# Run remove_noisy_tokens over every tweet and store the results back in "text".
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(remove_noisy_tokens, data["text"])]

In [16]:
def remove_remaining_noisy_tokens(tweet):
    """Drop single-character tokens and normalise whitespace.

    Single-character word tokens are blanked out first and whitespace is
    collapsed afterwards. Doing both in one substitution leaves runs of
    spaces behind, because each removed character is itself replaced by a
    space that the same pass never revisits.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet without one-character tokens, with every whitespace run
        reduced to a single space and no leading or trailing whitespace.
    """
    # Pass 1: blank out stand-alone single word characters.
    without_single_chars = re.sub(pattern=r'\b\w\b', string=tweet, repl=" ")
    # Pass 2: collapse whitespace runs, including those created by pass 1.
    return re.sub(pattern=r'\s+', string=without_single_chars, repl=" ").strip()

In [17]:
# Run remove_remaining_noisy_tokens over every tweet and store the results
# back in "text".
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(remove_remaining_noisy_tokens, data["text"])]

In [None]:
# Show the first five rows after the regex clean-up steps.
data.head(n=5)

In [None]:
# Tokenize every tweet with NLTK's word_tokenize; "text" now holds the
# per-tweet results instead of raw strings.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(word_tokenize, data["text"])]


In [47]:
def is_stopword(token):
    """Return True when ``token`` is absent from ``combined_stopwords``.

    NOTE(review): despite the name, a True result means the token is NOT a
    stop word. ``remove_stopwords`` relies on this polarity, so renaming or
    inverting it has to be done in both places together.
    """
    return not (token in combined_stopwords)

In [48]:
def remove_stopwords(tokenized_tweet):
    """Return, in order, the tokens of ``tokenized_tweet`` for which ``is_stopword`` is true."""
    return list(filter(is_stopword, tokenized_tweet))

In [None]:
# Run remove_stopwords over every tokenized tweet and store the results back
# in "text".
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(remove_stopwords, data["text"])]

In [None]:
# Checkpoint the "text" column after stop-word removal.
with open("stopwords_removed.pkl", mode="wb") as file_handle:
    pickle.Pickler(file_handle).dump(data["text"])

In [None]:
def lemmatize_tweet(tokenized_tweet):
    """Return the ``lemma_`` of every token spaCy finds in the tweet.

    The tokens are joined with single spaces, the resulting string is run
    through the module-level ``nlp`` pipeline, and the lemmas are collected
    in document order.
    """
    doc = nlp(" ".join(tokenized_tweet))
    return [token.lemma_ for token in doc]

In [None]:
# Lemmatize every tweet through the thread pool.
# pool.map returns a lazy iterator with no length, so tqdm is given the row
# count explicitly via `total`; without it the bar cannot show a percentage
# or an ETA.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = list(
        tqdm(pool.map(lemmatize_tweet, data["text"]), total=len(data["text"]))
    )

In [None]:
# Checkpoint the lemmatized "text" column so the TF-IDF cells can start from it.
with open("lemmatized_tweets.pkl", mode="wb") as file_handle:
    pickle.dump(
        data["text"],
        file_handle,
    )

In [None]:
# Start from an empty frame and restore only the lemmatized "text" column
# from the checkpoint written above.
# NOTE: pickle.load executes arbitrary code from the file; only load
# checkpoints that this notebook itself produced.
data = pd.DataFrame()

with open("lemmatized_tweets.pkl", mode="rb") as file_handle:
    data["text"] = pickle.Unpickler(file_handle).load()

In [None]:
# Join each tweet's tokens back into one space-separated string.
converted_raw_text = [" ".join(tokens) for tokens in data["text"]]

In [None]:
# Discard tweets that ended up as empty strings after cleaning.
converted_raw_text = [text for text in converted_raw_text if len(text) > 0]

In [None]:
# Number of non-empty tweets left after filtering.
len(converted_raw_text)

In [None]:
# Every distinct space-separated token across the cleaned tweets.
vocab = {
    token
    for cleaned_tweet in converted_raw_text
    for token in cleaned_tweet.split(" ")
}

In [None]:
# Size of the full vocabulary built above.
len(vocab)

In [None]:
# Fit a TF-IDF vectorizer with default settings on the cleaned tweets and
# keep the resulting document-term matrix.
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform(raw_documents=converted_raw_text)

In [None]:
# Shape of the TF-IDF matrix produced by the previous cell.
vectorized_text.shape

In [None]:
# Corpus-wide token counts, accumulated over every tweet's token list.
cumulative_tfs = Counter(
    token
    for cleaned_tweet in data["text"]
    for token in cleaned_tweet
)

In [None]:
# Keep the 30000 highest-count tokens (token -> count), in most_common order.
most_frequent_tokens = dict(cumulative_tfs.most_common(30000))
truncated_vocab = [*most_frequent_tokens]

# Map each kept token to its position in truncated_vocab.
truncated_vocab2idx = {
    token: position for position, token in enumerate(truncated_vocab)
}

In [None]:
# Re-fit TF-IDF with the vocabulary fixed to the truncated token -> index map.
vectorizer = TfidfVectorizer(vocabulary=truncated_vocab2idx)
vectorized_text = vectorizer.fit_transform(raw_documents=converted_raw_text)

In [None]:
# Fit a 64-component truncated SVD on the TF-IDF matrix built with the
# truncated vocabulary.
# The cell previously passed `truncated_tfidf_matrix`, a name that no cell in
# this notebook defines, so it raised NameError on a fresh run.
# `vectorized_text` is the matrix produced by the preceding cell.
# random_state is pinned because TruncatedSVD's default solver is randomized.
svd = TruncatedSVD(n_components=64, random_state=42)
svd.fit(vectorized_text)