In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet

# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import classification_report

In [123]:
# Load the two labelled post dumps, one CSV per source.
opins = pd.read_csv('./data/opins_senti_oned.csv')
college = pd.read_csv('./data/college_senti_zeroed.csv')

In [124]:
# Stack the two frames row-wise into one pooled frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own labels, so every label the two frames share appears
# twice and label-based lookups on df become ambiguous.
df = pd.concat([opins, college], axis=0, ignore_index=True)

In [125]:
df.head(2)

Unnamed: 0,created_utc,subreddit,post,post_length,post_word_count,sent_score
0,1597545000.0,unpopularopinion,Middle aged guys don't buy sports cars because...,1646,310,1
1,1613406000.0,unpopularopinion,"""Y'all"" is a brilliant use of the English lang...",885,179,1


In [126]:
df.shape

(3584, 6)

In [127]:
# Persist the pooled posts; index=False keeps the row index out of the file.
df.to_csv('data/combined_posts.csv', index=False)

In [128]:
# Reload the pooled posts from disk as the working frame for the steps below.
data = pd.read_csv('data/combined_posts.csv')

In [129]:
data.head(2)

Unnamed: 0,created_utc,subreddit,post,post_length,post_word_count,sent_score
0,1597545000.0,unpopularopinion,Middle aged guys don't buy sports cars because...,1646,310,1
1,1613406000.0,unpopularopinion,"""Y'all"" is a brilliant use of the English lang...",885,179,1


In [130]:
# Keep only the post text and its label; the metadata columns are not used below.
data = data.drop(columns=['created_utc', 'post_length', 'post_word_count', 'subreddit'])

In [131]:
data.head(2)

Unnamed: 0,post,sent_score
0,Middle aged guys don't buy sports cars because...,1
1,"""Y'all"" is a brilliant use of the English lang...",1


In [132]:
data.shape

(3584, 2)

# Preparing pooled data for EDA

In [133]:
# Lower-case every post in place so later token handling is case-insensitive

data['post'] = data['post'].str.lower()

In [134]:
# Helper to strip web addresses and runs of digits from text
import re
def remove_chars(text):
    """Strip web addresses and digit runs from a post.

    Parameters
    ----------
    text : str
        Post text (the notebook lower-cases it before calling this).

    Returns
    -------
    str
        ``text`` with every ``http...`` token and every ``www.`` address
        deleted, and each run of digits replaced by a single space.
    """
    # Drop anything from "http" up to the next whitespace, plus bare "www."
    # addresses that carry no scheme. The \b keeps the www branch from firing
    # inside ordinary words such as "awww...".
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    # Digits become a space (not ''), so the words on either side stay apart.
    text = re.sub(r'\d+', ' ', text)
    return text

In [135]:
# Apply remove_chars to every post; the cleaned text goes into its own column
data['post_nochars'] = data['post'].apply(remove_chars)

In [136]:
data['post_nochars'][2]

'i always eat dessert before dinner at a restaurantwhen at a restaurant, waiting staff always find it weird when i order the dessert before the appetizer and the main course. they ask “oh, is that all that you’re having?”. i’m like “no… i just want my cheesecake first, please”. i have to convince them that i’m just a dessert first kinda guy. i feel like the main course is much more enjoyable when your dopamine levels are boosted.'

In [137]:
# Tokenize on runs of word characters; everything else acts as a separator
tokenizer = RegexpTokenizer(r'\w+')
data['post_tokenized'] = data['post_nochars'].apply(tokenizer.tokenize)

In [138]:
len(data['post_tokenized'][2])

81

In [139]:
print(data['post_tokenized'][2])

['i', 'always', 'eat', 'dessert', 'before', 'dinner', 'at', 'a', 'restaurantwhen', 'at', 'a', 'restaurant', 'waiting', 'staff', 'always', 'find', 'it', 'weird', 'when', 'i', 'order', 'the', 'dessert', 'before', 'the', 'appetizer', 'and', 'the', 'main', 'course', 'they', 'ask', 'oh', 'is', 'that', 'all', 'that', 'you', 're', 'having', 'i', 'm', 'like', 'no', 'i', 'just', 'want', 'my', 'cheesecake', 'first', 'please', 'i', 'have', 'to', 'convince', 'them', 'that', 'i', 'm', 'just', 'a', 'dessert', 'first', 'kinda', 'guy', 'i', 'feel', 'like', 'the', 'main', 'course', 'is', 'much', 'more', 'enjoyable', 'when', 'your', 'dopamine', 'levels', 'are', 'boosted']


In [140]:
# remove stopwords
def remove_stopwords(words):
    """Drop English stop words from a list of tokens.

    Parameters
    ----------
    words : list of str
        Tokens of one post.

    Returns
    -------
    list of str
        The tokens of ``words`` that are not in NLTK's English stop-word
        list, in their original order.
    """
    # Fetch the stop-word list once per call and keep it in a set. The earlier
    # version re-evaluated stopwords.words('english') for every token and then
    # scanned the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in words if token not in english_stopwords]

In [141]:
# Filter stop words out of every token list (overwrites post_tokenized)
data['post_tokenized'] = data['post_tokenized'].apply(remove_stopwords)

In [142]:
data['post_tokenized'][0]

['middle',
 'aged',
 'guys',
 'buy',
 'sports',
 'cars',
 'mid',
 'life',
 'crisis',
 'finally',
 'afford',
 'car',
 'want',
 'hate',
 'hearing',
 'people',
 'say',
 'oh',
 'bought',
 'corvette',
 'balding',
 'needs',
 'feel',
 'younger',
 'someone',
 'never',
 'earned',
 'much',
 'money',
 'made',
 'spectacular',
 'decision',
 'love',
 'rather',
 'lucrative',
 'finally',
 'position',
 'late',
 'actually',
 'save',
 'buy',
 'dream',
 'car',
 'get',
 'cars',
 'important',
 'get',
 'dislike',
 'impact',
 'environment',
 'get',
 'think',
 'sports',
 'cars',
 'expensive',
 'hassle',
 'get',
 'see',
 'forty',
 'something',
 'guy',
 'bmw',
 'assume',
 'compensating',
 'something',
 'realize',
 'automotive',
 'enthusiasm',
 'huge',
 'part',
 'life',
 'lot',
 'people',
 'often',
 'biggest',
 'connection',
 'family',
 'members',
 'friends',
 'car',
 'enthusiast',
 'look',
 'forward',
 'increased',
 'electrification',
 'lower',
 'carbon',
 'footprint',
 'ridiculous',
 'torque',
 'better',
 'acce

#### Text lemmatization

In [143]:
# Parts-of-speech tagging 

#https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a POS tag (as returned by nltk.pos_tag) to a WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag returns ''.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # No WordNet equivalent for this tag.
    return ''

In [144]:
# Single lemmatizer instance, referenced as a global by lemmatize_list
wn = WordNetLemmatizer()
def lemmatize_list(lwords):
    """Lemmatize a list of tokens, using each token's POS tag where possible.

    Parameters
    ----------
    lwords : list of str
        Tokens of one post.

    Returns
    -------
    list of str
        One entry per input token, in order: the lemma from ``wn`` when
        get_wordnet_pos maps the token's tag to a WordNet POS, otherwise the
        token unchanged.
    """
    lemmas = []
    for token, treebank_tag in nltk.pos_tag(lwords):
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos == '':
            # The tag has no WordNet equivalent: keep the token as it is.
            lemmas.append(token)
        else:
            lemmas.append(wn.lemmatize(token, wordnet_pos))
    return lemmas

In [145]:
# Lemmatize each post's stop-word-free token list into a new column
data['post_lemmatized'] = data['post_tokenized'].apply(lemmatize_list)

In [146]:
data['post_lemmatized']

0       [middle, age, guy, buy, sport, car, mid, life,...
1       [brilliant, use, english, language, refuse, te...
2       [always, eat, dessert, dinner, restaurantwhen,...
3       [illegal, company, list, entry, level, job, re...
4       [news, dry, recitation, fact, opinion, intend,...
                              ...                        
3579    [rough, st, semesteri, finish, first, semester...
3580    [many, flashcard, make, study, exam, take, try...
3581    [grade, receive, first, ever, uni, college, es...
3582    [mental, health, kill, semester, campus, resou...
3583    [mess, please, help, maintain, attendance, int...
Name: post_lemmatized, Length: 3584, dtype: object

In [147]:
# Re-assemble each lemma list into one space-separated string per post
data['joined_words'] = data["post_lemmatized"].map(' '.join)
data.head(5)

Unnamed: 0,post,sent_score,post_nochars,post_tokenized,post_lemmatized,joined_words
0,middle aged guys don't buy sports cars because...,1,middle aged guys don't buy sports cars because...,"[middle, aged, guys, buy, sports, cars, mid, l...","[middle, age, guy, buy, sport, car, mid, life,...",middle age guy buy sport car mid life crisis f...
1,"""y'all"" is a brilliant use of the english lang...",1,"""y'all"" is a brilliant use of the english lang...","[brilliant, use, english, language, refuse, to...","[brilliant, use, english, language, refuse, te...",brilliant use english language refuse tell oth...
2,i always eat dessert before dinner at a restau...,1,i always eat dessert before dinner at a restau...,"[always, eat, dessert, dinner, restaurantwhen,...","[always, eat, dessert, dinner, restaurantwhen,...",always eat dessert dinner restaurantwhen resta...
3,it should be illegal for a company to list an ...,1,it should be illegal for a company to list an ...,"[illegal, company, list, entry, level, job, re...","[illegal, company, list, entry, level, job, re...",illegal company list entry level job require y...
4,news should be a dry recitation of facts. if i...,1,news should be a dry recitation of facts. if i...,"[news, dry, recitation, facts, opinion, intend...","[news, dry, recitation, fact, opinion, intend,...",news dry recitation fact opinion intend stir e...


#### Save lemmatized pooled posts as .CSV file for future use

In [148]:
# Write every column of the working frame, without the row index.
# NOTE(review): post_tokenized and post_lemmatized hold Python lists; presumably
# they come back as plain strings when this CSV is re-read — confirm before
# relying on them downstream (joined_words is already a plain string).
data.to_csv('data/lemmatized_posts.csv', index = False)

In [149]:
# Features: the space-joined lemmas of each post
X = data['joined_words']
# Target: the sent_score label of each post
y = data['sent_score']

In [150]:
y.value_counts(normalize = True)

sent_score
1    0.53404
0    0.46596
Name: proportion, dtype: float64

### Preparing college posts for predictions from various trained ML models

In [151]:
# Re-read the college posts from the original CSV, so this section starts from the unprocessed file
college = pd.read_csv('./data/college_senti_zeroed.csv')

In [152]:
college.head(1)

Unnamed: 0,created_utc,subreddit,post,post_length,post_word_count,sent_score
0,1543598000.0,college,Ever have a kid in class show up and realize i...,871,174,0


In [153]:
# Keep only the post text and its label; the metadata columns are not used below.
college = college.drop(columns=['created_utc', 'subreddit', 'post_length', 'post_word_count'])

In [154]:
college.head()

Unnamed: 0,post,sent_score
0,Ever have a kid in class show up and realize i...,0
1,"It’s the little things that count, this profes...",0
2,Professor saved my assDuring winter term at my...,0
3,My professor gave out all the answers to the f...,0
4,An Honest Letter from Your University Presiden...,0


In [155]:
# Lower-case every college post in place, as was done for the pooled posts
college['post'] = college['post'].str.lower()

In [156]:
import re
def remove_chars(text):
    """Strip web addresses and digit runs from a post.

    NOTE(review): this name is also defined earlier in the notebook; from this
    cell on, this definition is the one in effect.

    Parameters
    ----------
    text : str
        Post text (the notebook lower-cases it before calling this).

    Returns
    -------
    str
        ``text`` with every ``http...`` token and every ``www.`` address
        deleted, and each run of digits replaced by a single space.
    """
    # Drop anything from "http" up to the next whitespace, plus bare "www."
    # addresses that carry no scheme. The \b keeps the www branch from firing
    # inside ordinary words such as "awww...".
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    # Digits become a space (not ''), so the words on either side stay apart.
    text = re.sub(r'\d+', ' ', text)
    return text

In [157]:
# Apply remove_chars to every college post.
# NOTE(review): despite its name, post_tokenized holds the cleaned *string* at
# this point; a later cell overwrites it with the token list. The pooled
# pipeline stores this intermediate text in a separate post_nochars column.
college['post_tokenized'] = college['post'].apply(remove_chars)

In [158]:
len(college['post_tokenized'][3])

921

In [159]:
# Tokenize on runs of word characters; the cleaned string in post_tokenized is replaced by its token list
tokenizer = RegexpTokenizer(r'\w+')
college['post_tokenized'] = college['post_tokenized'].apply(tokenizer.tokenize)

In [160]:
len(college['post_tokenized'][3])

175

In [161]:
print(college['post_tokenized'][3])

['my', 'professor', 'gave', 'out', 'all', 'the', 'answers', 'to', 'the', 'final', 'i', 'just', 'left', 'the', 'exam', 'hall', 'for', 'my', 'cellular', 'and', 'molecular', 'biology', 'class', 'gen', 'bio', 'and', 'i', 'am', 'absolutely', 'baffled', 'right', 'now', 'last', 'week', 'my', 'professor', 'gave', 'us', 'a', 'packet', 'with', 'questions', 'and', 'also', 'gave', 'the', 'answers', 'she', 'said', 'they', 'would', 'be', 'a', 'similar', 'style', 'to', 'the', 'final', 'and', 'give', 'us', 'an', 'idea', 'of', 'what', 'topics', 'to', 'study', 'most', 'i', 'just', 'took', 'the', 'exam', 'and', 'it', 'was', 'literally', 'the', 'review', 'packet', 'question', 'for', 'question', 'she', 'even', 'reprinted', 'it', 'with', 'the', 'word', 'review', 'obviously', 'crossed', 'out', 'so', 'it', 'just', 'read', 'final', 'exam', 'needless', 'to', 'say', 'i', 'finished', 'it', 'in', 'minutes', 'as', 'did', 'half', 'of', 'the', 'class', 'i', 'genuinely', 'think', 'she', 'intended', 'to', 'write', 'a',

In [162]:
def remove_stopwords(words):
    """Drop English stop words from a list of tokens.

    NOTE(review): this name is also defined earlier in the notebook; from this
    cell on, this definition is the one in effect.

    Parameters
    ----------
    words : list of str
        Tokens of one post.

    Returns
    -------
    list of str
        The tokens of ``words`` that are not in NLTK's English stop-word
        list, in their original order.
    """
    # Fetch the stop-word list once per call and keep it in a set. The earlier
    # version re-evaluated stopwords.words('english') for every token and then
    # scanned the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in words if token not in english_stopwords]

In [163]:
# Filter stop words out of every token list (overwrites post_tokenized)
college['post_tokenized'] = college['post_tokenized'].apply(remove_stopwords)

In [164]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a POS tag (as returned by nltk.pos_tag) to a WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag returns ''.

    NOTE(review): this name is also defined earlier in the notebook; from this
    cell on, this definition is the one in effect.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # No WordNet equivalent for this tag.
    return ''

In [165]:
def lemmatize_list(lwords):
    """Lemmatize a list of tokens, using each token's POS tag where possible.

    Uses the module-level lemmatizer ``wn``, which this cell does not create.

    Parameters
    ----------
    lwords : list of str
        Tokens of one post.

    Returns
    -------
    list of str
        One entry per input token, in order: the lemma from ``wn`` when
        get_wordnet_pos maps the token's tag to a WordNet POS, otherwise the
        token unchanged.
    """
    lemmas = []
    for token, treebank_tag in nltk.pos_tag(lwords):
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos == '':
            # The tag has no WordNet equivalent: keep the token as it is.
            lemmas.append(token)
        else:
            lemmas.append(wn.lemmatize(token, wordnet_pos))
    return lemmas

In [166]:
# Lemmatize each college post's stop-word-free token list into a new column
college['post_lemmatized'] = college['post_tokenized'].apply(lemmatize_list)

In [167]:
# Re-assemble each lemma list into one space-separated string per post
college['joined_words'] = college["post_lemmatized"].map(' '.join)
college.head(5)

Unnamed: 0,post,sent_score,post_tokenized,post_lemmatized,joined_words
0,ever have a kid in class show up and realize i...,0,"[ever, kid, class, show, realize, exam, day, g...","[ever, kid, class, show, realize, exam, day, g...",ever kid class show realize exam day get leave...
1,"it’s the little things that count, this profes...",0,"[little, things, count, professor, gem, stayed...","[little, thing, count, professor, gem, stay, l...",little thing count professor gem stay late com...
2,professor saved my assduring winter term at my...,0,"[professor, saved, assduring, winter, term, un...","[professor, save, assduring, winter, term, uni...",professor save assduring winter term uni bad m...
3,my professor gave out all the answers to the f...,0,"[professor, gave, answers, final, left, exam, ...","[professor, give, answer, final, leave, exam, ...",professor give answer final leave exam hall ce...
4,an honest letter from your university presiden...,0,"[honest, letter, university, president, openin...","[honest, letter, university, president, open, ...",honest letter university president open fallde...


#### Save lemmatized college posts as .CSV file for future use

In [168]:
# Write every column of the college frame, without the row index.
# NOTE(review): post_tokenized and post_lemmatized hold Python lists; presumably
# they come back as plain strings when this CSV is re-read — confirm before
# relying on them downstream (joined_words is already a plain string).
college.to_csv('./data/coll_lemmatized_posts.csv', index = False)