In [140]:
# PROBLEM 1

import pandas as pd

''' 
making the initial dataframes
(i delay concatenating them so that i can potentially reuse each with different "versions" of the other)
'''

# filepath to the folder containing the metadata
corpus_path = '/Users/faithrta/engl_490_python/LESSON_9_PSET_DATA/'

# making dataframes for each csv file (one for the reddit data, another for the nyt one)
meta_reddit = pd.read_csv(corpus_path + 'REDDIT_news_2008_meta.csv', encoding='latin1')
meta_nyt = pd.read_csv(corpus_path + 'NYT_2008_META.csv', encoding='latin1')

# deleting all columns of the original csv files except for the one containing filenames
meta_reddit = meta_reddit[['file_id']].rename(columns = {"file_id": "FILENAME"})
meta_nyt = meta_nyt[['file_id']].rename(columns = {"file_id": "FILENAME"})

# creating a new column stating the source of each text (for when they're concatenated)
meta_reddit['SOURCE'] = 'Reddit'
meta_nyt['SOURCE'] = 'NYT'

# with the original datasets, my code never stopped running; I've thus cut down
# the number of entries I take from reddit and the nyt, unfortunately
meta_reddit_cut = meta_reddit.iloc[0:5000]
meta_nyt_cut = meta_nyt.iloc[0:5000]

meta_reddit_cut.head(10)


Unnamed: 0,FILENAME,SOURCE
0,c035ug0.txt,Reddit
1,c039a2t.txt,Reddit
2,c03ad2h.txt,Reddit
3,c03cnx9.txt,Reddit
4,c03d5p6.txt,Reddit
5,c03de8j.txt,Reddit
6,c03deaq.txt,Reddit
7,c03dlna.txt,Reddit
8,c03g070.txt,Reddit
9,c03gq7p.txt,Reddit


In [141]:
# PROBLEM 1 contd (pt 2)

import codecs

''' 
adding the full text of each file as a column
(technically this isn't needed until later but i thought it'd be good for me to see the texts
to generate ideas for question 2)
'''

# added this because I was getting an error message for "meta_reddit_cut['TEXT'] = ''"
pd.options.mode.chained_assignment = None

# adding new columns for each file's full text
meta_reddit_cut['TEXT'] = ''
meta_nyt_cut['TEXT'] = ''

# ----------------------------------- reddit ---------------------------------- #

for index, row in meta_reddit_cut.iterrows():
    # making a new filename that includes the file's path
    filepath_name = 'REDDIT_2008_text/' + row['FILENAME']
    text = codecs.open(corpus_path + filepath_name, "r", encoding='utf8').read()
    
    # saving the new filename and the full text of the current reddit post
    meta_reddit_cut.at[index, 'FILENAME'] = filepath_name
    meta_reddit_cut.at[index, 'TEXT'] = text
    
# ------------------------------------- nyt ------------------------------------ #
    
# when trying to open files using the filenames listed the NYT csv, I found some
# entries that didn't have corresponding files in the NYT folder;
# i thus use this list to save the indices of rows to drop
drop_rows = []
    
for index, row in meta_nyt_cut.iterrows():
    # making a new filename that includes the file's path
    filepath_name = 'NYT_' + row['FILENAME']
    
    # using a try-catch block as I ran into a FileNotFoundError while testing
    # (there were rows in the nyt csv file with filenames that do not exist in the nyt folder)
    try:
        text = codecs.open(corpus_path + filepath_name, "r", encoding='utf8').read()
        
    # if the current file cannot be opened (it DNE), adds the current index to drop_rows
    except FileNotFoundError:
        drop_rows.append(index)
        
        # continues to the next nyt article
        continue
    
    # saving the new filename and the full text of the current article (if its file exists)
    meta_nyt_cut.at[index, 'FILENAME'] = filepath_name
    meta_nyt_cut.at[index, 'TEXT'] = text

# dropping the rows that do not have valid files in the nyt folder
meta_nyt_cut = meta_nyt_cut.drop(drop_rows)
meta_nyt_cut = meta_nyt_cut.reset_index(drop=True)

# -------------------------------- concatenation ------------------------------- #

# combining the metadata for reddit and nyt into one dataframe and resetting the indices
meta_both = pd.concat([meta_reddit_cut, meta_nyt_cut])
meta_both = meta_both.reset_index(drop=True)

meta_both.head(-20)

Unnamed: 0,FILENAME,SOURCE,TEXT
0,REDDIT_2008_text/c035ug0.txt,Reddit,On your social networks all your friends and c...
1,REDDIT_2008_text/c039a2t.txt,Reddit,Clearly Angela Durante is in dire need of a cu...
2,REDDIT_2008_text/c03ad2h.txt,Reddit,"If you enjoyed this video, you will love this ..."
3,REDDIT_2008_text/c03cnx9.txt,Reddit,Seriously doubt truthnews factors into that.
4,REDDIT_2008_text/c03d5p6.txt,Reddit,I have a major issue with the work skeptic the...
...,...,...,...
9974,NYT_2008/5/5481dcc338f0d874625c9456.txt,NYT,BAGHDAD — Prime Minister \nNuri Kamal al-Malik...
9975,NYT_2008/5/5481dc6a38f0d874625c944b.txt,NYT,Hoping to curb the increase in the number of y...
9976,NYT_2008/5/5481dc6538f0d874625c944a.txt,NYT,"BEIRUT, Lebanon — A gunman opened fire in a mo..."
9977,NYT_2008/5/5481dc5d38f0d874625c9449.txt,NYT,PARIS — It is \nVladimir V. Putin\n’s first tr...


In [142]:
# PROBLEM 1 contd (pt 3)

import re

''' making a custom pre-processor for the next cell's CountVectorizer '''

# elimintes numbers and instances of "_", "\", and "—"
def my_preprocessor(text):
    text = text.lower()
    text = re.sub('([0-9—_\\\\])', '', text)
    return text

In [143]:
# PROBLEM 1 contd (pt 4)

from sklearn.feature_extraction.text import CountVectorizer

''' using CountVectorizer to make a DTM based on the words in the corpus '''

# creating a new vecorizer
vectorizer = CountVectorizer(input='content', preprocessor=my_preprocessor, stop_words='english', min_df=5, encoding='utf8')
dtm = vectorizer.fit_transform(meta_both['TEXT'])
vocab = vectorizer.get_feature_names()
matrix = dtm.toarray()

# combining the DTM with the metadata
DTM = pd.DataFrame(matrix, columns=vocab)

# attaching the DTM to the original dataframe
dtm_both = pd.concat([meta_both, DTM], axis=1)

# changing all instances of "Reddit" to 0 and "NYT" to 1 under the "SOURCE" column
# for the following sklearn cells
dtm_both.loc[dtm_both.SOURCE == 'Reddit', 'SOURCE'] = 0
dtm_both.loc[dtm_both.SOURCE == 'NYT', 'SOURCE'] = 1

dtm_both.head(-20)


Unnamed: 0,FILENAME,SOURCE,TEXT,aaron,aba,aback,abandon,abandoned,abandoning,abandonment,...,zuma,zurich,zwilling,zyuganov,álvaro,ángel,édgar,élysée,état,óscar
0,REDDIT_2008_text/c035ug0.txt,0,On your social networks all your friends and c...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,REDDIT_2008_text/c039a2t.txt,0,Clearly Angela Durante is in dire need of a cu...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,REDDIT_2008_text/c03ad2h.txt,0,"If you enjoyed this video, you will love this ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,REDDIT_2008_text/c03cnx9.txt,0,Seriously doubt truthnews factors into that.,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,REDDIT_2008_text/c03d5p6.txt,0,I have a major issue with the work skeptic the...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9974,NYT_2008/5/5481dcc338f0d874625c9456.txt,1,BAGHDAD — Prime Minister \nNuri Kamal al-Malik...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9975,NYT_2008/5/5481dc6a38f0d874625c944b.txt,1,Hoping to curb the increase in the number of y...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9976,NYT_2008/5/5481dc6538f0d874625c944a.txt,1,"BEIRUT, Lebanon — A gunman opened fire in a mo...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9977,NYT_2008/5/5481dc5d38f0d874625c9449.txt,1,PARIS — It is \nVladimir V. Putin\n’s first tr...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [144]:
# PROBLEM 1 contd (pt 5)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

''' preparing for the ML part of the pset; creating training and test sets '''

# the x values are the words in the DTM
x_values = dtm_both.iloc[:, 3:].values.astype(float)

# the y values are the 0s and 1s representing whether a file is from reddit or nyt
y_values = dtm_both.iloc[:, 1].values.astype(float)

# splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.3)

model = LogisticRegression().fit(X_train, y_train)

# predicting class labels for the test set
predicted = model.predict(X_test)

# generating class probabilities
probs = model.predict_proba(X_test)

In [145]:
# PROBLEM 1 contd (pt 6)

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

''' testing the model using the test sets and the predicted class labels '''

# predicting class labels for the test set
predicted = model.predict(X_test)

# generating class probabilities
probs = model.predict_proba(X_test)

# evaluation metrics; accuracy + ROC
print("Success rate of classification: " + str(accuracy_score(y_test, predicted)))

# confusion matrix, F-1 score
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

Success rate of classification: 0.9763333333333334
[[1485    6]
 [  65 1444]]
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      1491
         1.0       1.00      0.96      0.98      1509

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000



In [146]:
# PROBLEM 1 contd (pt 7)

import numpy as np

''' outputting the top features distinguishing files from reddit vs those from nyt '''

clf = LogisticRegression(penalty='l1', solver='liblinear')
clf.fit(X_train, y_train)
    
# keeping track of feature names and class labels
feature_names = dtm_both.columns[3:].values     
class_labels = dtm_both['SOURCE'].unique()

# getting the co-efficients for the features associated with nyt and matching them to their feature name
top20 = np.argsort(np.exp(clf.coef_))[0][-10:] 

# getting the co-efficients for the features associated with reddit matching them to their feature name
bottom20 = np.argsort(np.exp(clf.coef_))[0][:10]

# outputting the top 10 features associated with nyt and the top 10 associated with reddit
print("Top 10 features associated with second class (NYT)\n")
for el in zip(feature_names[top20], np.exp(clf.coef_)[0][top20]):
    print(el)
print("\n")
print("Top 10 features associated with first class (Reddit)\n")
for el in zip(feature_names[bottom20], np.exp(clf.coef_)[0][bottom20]):
    print(el)

Top 10 features associated with second class (NYT)

('david', 11.837559025308545)
('photo', 12.59629125194119)
('estate', 13.840026357659116)
('arab', 14.205758348857957)
('photograph', 22.83632120579337)
('grace', 23.348727039549573)
('fighting', 24.990698888769312)
('manhattan', 26.865768667457512)
('nytimes', 30.16152804155062)
('rep', 31.959556851618185)


Top 10 features associated with first class (Reddit)

('http', 0.012091556432413246)
('gt', 0.12503050284473832)
('actually', 0.2054899570363611)
('just', 0.2811349424141666)
('world', 0.33645469131491484)
('voting', 0.3664010556900865)
('shit', 0.4309272834377997)
('war', 0.4496562383351771)
('really', 0.45970140426577927)
('article', 0.46541394413643067)


In [147]:
# PROBLEM 2 pre-explanation

# what i thought
''' 
For the open-ended part of this probleem set, I thought that the differences between NYT articles and
Reddit posts would be pretty clear, given the drastic contrast in tone that I've personally experienced
when reading the former versus the latter. 

I thus decided to do two experiments:
1) Compare the superlative "best" adjectives between the two corpora
2) Compare the third-person singular verbs between the NYT articles and Reddit comments that got more downvotes than upvotes

My predictions:
1) Reddit would have a lot more negative superlative "best" adjectives (e.g., worst, dirtiest, grossest). Given that
these types of words generally convey a strong stance and Reddit is known to be a forum where individuals share their (often
strong) opinions, I thought it would make sense to see more negative superlative adjectives in Reddit posts than from copy-edited,
more professional news articles
2) Reddit would have more physically oriented third-person singular verbs. These verbs appear in sentences in which another person is
being described. Since these Reddit posts are taken from the News subreddit, I think people are more inclined to write about others
who have "done something," if that makes sense, compared to news articles which I view as more descriptive
'''

# what i did
'''
1) Basically, I recreated the same pipeline from question 1 but with a custom pre-processor that takes as input a list
of tuples (where the first element is the word and the second is its POS tag) and outputs the words with the "JJS" tag
as a string
2) For this experiment, I only used the Reddit posts with net negative votes. I did this by creating a new dataframe that isolated
rows with entries < 0 in the 'score' column of the original Reddit csv file. Then, I made another custom pre-processor that
takes as input a list of tuples (where the first element is the word and the second is its POS tag) and outputs the words with
the "VBZ" tag as a string
'''

# what happened
'''
See the reflections under each experiment's "top 10 features" cell
'''

'\nSee the reflections under each experiment\'s "top 10 features" cell\n'

In [148]:
# PROBLEM 2

from nltk.tokenize import word_tokenize

''' adding a new column to the dataframes for both corpora with their texts tokenized by words '''

# keeping the two dataframes separate so that I can the meta_nyt_cut one with a different version of
# reddit metadata later on without having to recompute WORD_TOKS and other info
meta_reddit_cut['WORD_TOKS'] = ''
meta_nyt_cut['WORD_TOKS'] = ''

# ----------------------------------- reddit ---------------------------------- #

for index, row in meta_reddit_cut.iterrows():
    # tokenizing each text's words, then saving these to a column
    meta_reddit_cut.at[index, 'WORD_TOKS'] = word_tokenize(row['TEXT'].lower())
    
# ------------------------------------- nyt ------------------------------------ #
    
for index, row in meta_nyt_cut.iterrows():
    # tokenizing each text's words, then saving these to a column
    meta_nyt_cut.at[index, 'WORD_TOKS'] = word_tokenize(row['TEXT'].lower())
    
meta_nyt_cut.head(10)
    

Unnamed: 0,FILENAME,SOURCE,TEXT,WORD_TOKS
0,NYT_2008/1/547c5dd838f0d813efccc063.txt,NYT,"DES MOINES — As Mitt Romney, Fred S. Thompson ...","[des, moines, —, as, mitt, romney, ,, fred, s...."
1,NYT_2008/1/547c5b2538f0d813efccc022.txt,NYT,DES MOINES — Dennis Kucinich today urged his ...,"[des, moines, —, dennis, kucinich, today, urge..."
2,NYT_2008/1/547c58a538f0d813efccbfde.txt,NYT,"PORT ST. LUCIE, Fla. — On Nov. 28, Marcia L. D...","[port, st., lucie, ,, fla., —, on, nov., 28, ,..."
3,NYT_2008/1/547c584738f0d813efccbfd4.txt,NYT,DES MOINES — The Democratic presidential candi...,"[des, moines, —, the, democratic, presidential..."
4,NYT_2008/1/547c5a9b38f0d813efccc017.txt,NYT,"STANLEY, N.D. — At dawn, people from faraway s...","[stanley, ,, n.d., —, at, dawn, ,, people, fro..."
5,NYT_2008/1/547c57c038f0d813efccbfc8.txt,NYT,DES MOINES — Spurred by a recent \nSupreme Cou...,"[des, moines, —, spurred, by, a, recent, supre..."
6,NYT_2008/1/547c579938f0d813efccbfc5.txt,NYT,DES MOINES — \nIowa\n is packed with president...,"[des, moines, —, iowa, is, packed, with, presi..."
7,NYT_2008/1/547c578d38f0d813efccbfc4.txt,NYT,"LOS ANGELES — Sara Jane Moore, a 1970s radical...","[los, angeles, —, sara, jane, moore, ,, a, 197..."
8,NYT_2008/1/547c575e38f0d813efccbfc0.txt,NYT,"DES MOINES — Just before Thanksgiving, \nMitt ...","[des, moines, —, just, before, thanksgiving, ,..."
9,NYT_2008/1/547c574938f0d813efccbfbe.txt,NYT,WASHINGTON — \nPakistan\n’s ambassador to the ...,"[washington, —, pakistan, ’, s, ambassador, to..."


In [149]:
# PROBLEM 2 contd (pt 2)

from nltk.tokenize import sent_tokenize
from nltk import pos_tag

''' adding a new column to the dataframes with each text's POS tags '''

meta_reddit_cut['POS_TAGS'] = ''
meta_nyt_cut['POS_TAGS'] = ''

# ----------------------------------- reddit ---------------------------------- #

for index, row in meta_reddit_cut.iterrows():
    
    # creating a list of sublists of POS tags where each sublist represents a sentence
    list_of_sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(row['TEXT'])]
    
    # flattening the list of sublists into a simple list of POS tags
    meta_reddit_cut.at[index, 'POS_TAGS'] = [POS_tuple for sublist in list_of_sentences for POS_tuple in sublist]
    
# the above code runs a bit slow so here is a checkpoint
print("halfway there!")

# ------------------------------------- nyt ------------------------------------ #
    
for index, row in meta_nyt_cut.iterrows():
    
    # creating a list of sublists of POS tags where each sublist represents a sentence
    list_of_sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(row['TEXT'])]
    
    # flattening the list of sublists into a simple list of POS tags
    meta_nyt_cut.at[index, 'POS_TAGS'] = [POS_tuple for sublist in list_of_sentences for POS_tuple in sublist]

# combining the metadata for reddit and the nyt into one dataframe;
# also renaming the filename column and resetting the indices
meta_both = pd.concat([meta_reddit_cut, meta_nyt_cut])
meta_both = meta_both.reset_index(drop=True)

meta_both.head(-20)

halfway there!


Unnamed: 0,FILENAME,SOURCE,TEXT,WORD_TOKS,POS_TAGS
0,REDDIT_2008_text/c035ug0.txt,Reddit,On your social networks all your friends and c...,"[on, your, social, networks, all, your, friend...","[(On, IN), (your, PRP$), (social, JJ), (networ..."
1,REDDIT_2008_text/c039a2t.txt,Reddit,Clearly Angela Durante is in dire need of a cu...,"[clearly, angela, durante, is, in, dire, need,...","[(Clearly, RB), (Angela, NNP), (Durante, NNP),..."
2,REDDIT_2008_text/c03ad2h.txt,Reddit,"If you enjoyed this video, you will love this ...","[if, you, enjoyed, this, video, ,, you, will, ...","[(If, IN), (you, PRP), (enjoyed, VBP), (this, ..."
3,REDDIT_2008_text/c03cnx9.txt,Reddit,Seriously doubt truthnews factors into that.,"[seriously, doubt, truthnews, factors, into, t...","[(Seriously, RB), (doubt, JJ), (truthnews, NNS..."
4,REDDIT_2008_text/c03d5p6.txt,Reddit,I have a major issue with the work skeptic the...,"[i, have, a, major, issue, with, the, work, sk...","[(I, PRP), (have, VBP), (a, DT), (major, JJ), ..."
...,...,...,...,...,...
9974,NYT_2008/5/5481dcc338f0d874625c9456.txt,NYT,BAGHDAD — Prime Minister \nNuri Kamal al-Malik...,"[baghdad, —, prime, minister, nuri, kamal, al-...","[(BAGHDAD, NNP), (—, NNP), (Prime, NNP), (Mini..."
9975,NYT_2008/5/5481dc6a38f0d874625c944b.txt,NYT,Hoping to curb the increase in the number of y...,"[hoping, to, curb, the, increase, in, the, num...","[(Hoping, VBG), (to, TO), (curb, VB), (the, DT..."
9976,NYT_2008/5/5481dc6538f0d874625c944a.txt,NYT,"BEIRUT, Lebanon — A gunman opened fire in a mo...","[beirut, ,, lebanon, —, a, gunman, opened, fir...","[(BEIRUT, NNP), (,, ,), (Lebanon, NNP), (—, VB..."
9977,NYT_2008/5/5481dc5d38f0d874625c9449.txt,NYT,PARIS — It is \nVladimir V. Putin\n’s first tr...,"[paris, —, it, is, vladimir, v., putin, ’, s, ...","[(PARIS, NNP), (—, NN), (It, PRP), (is, VBZ), ..."


In [150]:
# PROBLEM 2 contd (pt 3)

''' creating a custom pre-processor that only extracts words tagged with "JJS" (superlative adjectives) '''

# assumes that the input is a list of POS tags
def JJS_preprocessor(list_of_POS_tags):
    
    # a list of the words to output
    output_text_as_list = []
    
    for POS_tuple in list_of_POS_tags:
        
        # if the word in the current POS tuple has numbers in it, do not save it
        if(re.search('[0-9]', POS_tuple[0])):
            continue
        
        # if the word in the current POS tuple is a superlative adjective
        if (POS_tuple[1] in ['JJS']):
            
            # appends the word to a list of words to keep
            output_text_as_list.append(POS_tuple[0].lower())
    
    # returns a string with each superlative adjective separated by a space
    return " ".join(output_text_as_list)

In [151]:
# PROBLEM 2 contd (pt 4)

from nltk.corpus import words

''' re-doing the previous CountVectorizer step to create a new DTM with only superlative adjectives '''

# creating a new vecorizer
vectorizer = CountVectorizer(input='content', preprocessor=JJS_preprocessor, stop_words='english', min_df=2, encoding='utf8')
dtm = vectorizer.fit_transform(meta_both['POS_TAGS'])
vocab = vectorizer.get_feature_names()
matrix = dtm.toarray()

# combining the DTM with the metadata (the matrix of 0s and 1s with the vocabulary)
DTM = pd.DataFrame(matrix, columns=vocab)

# attaching the DTM to the original dataframe
new_dtm_both = pd.concat([meta_both, DTM], axis=1)

# changing all instances of "Reddit" to 0 and "NYT" to 1 under the "SOURCE" column
# for the following sklearn cells
new_dtm_both.loc[new_dtm_both.SOURCE == 'Reddit', 'SOURCE'] = 0
new_dtm_both.loc[new_dtm_both.SOURCE == 'NYT', 'SOURCE'] = 1

new_dtm_both.head(-20)

Unnamed: 0,FILENAME,SOURCE,TEXT,WORD_TOKS,POS_TAGS,arrest,behest,best,biggest,bitterest,...,weakest,wealthiest,west,wettest,whitest,widest,wildest,worst,wrest,youngest
0,REDDIT_2008_text/c035ug0.txt,0,On your social networks all your friends and c...,"[on, your, social, networks, all, your, friend...","[(On, IN), (your, PRP$), (social, JJ), (networ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,REDDIT_2008_text/c039a2t.txt,0,Clearly Angela Durante is in dire need of a cu...,"[clearly, angela, durante, is, in, dire, need,...","[(Clearly, RB), (Angela, NNP), (Durante, NNP),...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,REDDIT_2008_text/c03ad2h.txt,0,"If you enjoyed this video, you will love this ...","[if, you, enjoyed, this, video, ,, you, will, ...","[(If, IN), (you, PRP), (enjoyed, VBP), (this, ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,REDDIT_2008_text/c03cnx9.txt,0,Seriously doubt truthnews factors into that.,"[seriously, doubt, truthnews, factors, into, t...","[(Seriously, RB), (doubt, JJ), (truthnews, NNS...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,REDDIT_2008_text/c03d5p6.txt,0,I have a major issue with the work skeptic the...,"[i, have, a, major, issue, with, the, work, sk...","[(I, PRP), (have, VBP), (a, DT), (major, JJ), ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9974,NYT_2008/5/5481dcc338f0d874625c9456.txt,1,BAGHDAD — Prime Minister \nNuri Kamal al-Malik...,"[baghdad, —, prime, minister, nuri, kamal, al-...","[(BAGHDAD, NNP), (—, NNP), (Prime, NNP), (Mini...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9975,NYT_2008/5/5481dc6a38f0d874625c944b.txt,1,Hoping to curb the increase in the number of y...,"[hoping, to, curb, the, increase, in, the, num...","[(Hoping, VBG), (to, TO), (curb, VB), (the, DT...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9976,NYT_2008/5/5481dc6538f0d874625c944a.txt,1,"BEIRUT, Lebanon — A gunman opened fire in a mo...","[beirut, ,, lebanon, —, a, gunman, opened, fir...","[(BEIRUT, NNP), (,, ,), (Lebanon, NNP), (—, VB...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9977,NYT_2008/5/5481dc5d38f0d874625c9449.txt,1,PARIS — It is \nVladimir V. Putin\n’s first tr...,"[paris, —, it, is, vladimir, v., putin, ’, s, ...","[(PARIS, NNP), (—, NN), (It, PRP), (is, VBZ), ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [152]:
# PROBLEM 2 contd (pt 5)

''' preparing for the ML part of the pset; creating training and test sets '''

# the x values are the superlative adjectives
x_values = new_dtm_both.iloc[:, 5:].values.astype(float)

# the y values are the 0s and 1s representing whether a file is from reddit or nyt
y_values = new_dtm_both.iloc[:, 1].values.astype(float)

# splitting the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.3)

model = LogisticRegression().fit(X_train, y_train)

# predicting class labels for the test set
predicted = model.predict(X_test)

# generating class probabilities
probs = model.predict_proba(X_test)

# Evaluation metrics; accuracy + ROC
print("Success rate of classification: " + str(accuracy_score(y_test, predicted)))

# Confusion matrix, F-1 score
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

Success rate of classification: 0.7286666666666667
[[1463   42]
 [ 772  723]]
              precision    recall  f1-score   support

         0.0       0.65      0.97      0.78      1505
         1.0       0.95      0.48      0.64      1495

    accuracy                           0.73      3000
   macro avg       0.80      0.73      0.71      3000
weighted avg       0.80      0.73      0.71      3000



In [153]:
# PROBLEM 2 contd (pt 6)

''' outputting the top superlative adjectives distinguishing files from reddit vs those from nyt '''

clf = LogisticRegression(penalty='l1', solver='liblinear') # penalty='l1', C=0.1
clf.fit(X_train, y_train)
    
# keeping track of feature names and class labels
feature_names = new_dtm_both.columns[3:].values     
class_labels = new_dtm_both['SOURCE'].unique()

# getting the co-efficients for the features associated with nyt and matching them to their feature name
top20 = np.argsort(np.exp(clf.coef_))[0][-10:] 

# getting the co-efficients for the features associated with reddit matching them to their feature name
bottom20 = np.argsort(np.exp(clf.coef_))[0][:10]

# outputting the top 10 features associated with nyt and the top 10 associated with reddit
print("Top 10 features associated with second class (NYT)\n")
for el in zip(feature_names[top20], np.exp(clf.coef_)[0][top20]):
    print(el)
print("\n")
print("Top 10 features associated with first class (Reddit)\n")
for el in zip(feature_names[bottom20], np.exp(clf.coef_)[0][bottom20]):
    print(el)

Top 10 features associated with second class (NYT)

('nicest', 19.08358818156899)
('thorniest', 19.614290010682904)
('behest', 20.417162506292197)
('worst', 21.990658672159253)
('weakest', 24.95521994676611)
('cleanest', 25.206367264325735)
('inquest', 36.39991961445652)
('invest', 55.10351668562152)
('stickiest', 55.34939837760005)
('healthiest', 90.34219795115624)


Top 10 features associated with first class (Reddit)

('farthest', 0.1302119765884973)
('saddest', 1.0)
('safest', 1.0)
('gravest', 1.0)
('savviest', 1.0)
('furthest', 1.0)
('fullest', 1.0)
('richest', 1.0)
('fourth', 1.0)
('foremost', 1.0)


In [154]:
# PROBLEM 2 contd -- REFLECTION

# What happened
'''
Recall: I hypothesized that Reddit would have a lot more negative superlative "best" adjectives (e.g., worst, dirtiest, grossest).
Instead, it seems that there are no distinguishing features for Reddit besides "farthest", which is nevertheless rather
close to the "baseline" (or no impact) value of 1.0. Instead, it seems that it is the NYT articles that are more heavily
defined by superlative "best" adjectives, which is odd as I would think that these would simply describe situations rather than
take a specific value judgment. For me, at least, this is pretty interesting! Perhaps a similar line of logic may bear a more
rigorous experiment supporting our common intuition that the news isn't as impartial as it seems.
'''


'\nRecall: I hypothesized that Reddit would have a lot more negative superlative "best" adjectives (e.g., worst, dirtiest, grossest).\nInstead, it seems that there are no distinguishing features for Reddit besides "strictest", which is nevertheless rather\nclose to the "baseline" (or no impact) value of 1.0. Instead, it seems that it is the NYT artciles that are more heavily\ndefined by superlative "best" adjectives, which is odd as I would think that these would simply describe situations rather than\ntake a specific value judgment. For me, at least, this is pretty interesting! Perhaps a similar line of logic may bear a more\nrigorous experiment supporting our common intuition that the news isn\'t as impartial as it seems.\n'

In [168]:
# PROBLEM 2 contd (pt 7)

''' redoing the above, but only with reddit posts that have net negative votes '''

# making a new dataframe
temp_meta_reddit = pd.read_csv(corpus_path + 'REDDIT_news_2008_meta.csv', encoding='latin1')

# saving only the reddit posts with net negative votes
meta_reddit_negative = temp_meta_reddit.loc[temp_meta_reddit['score'] < 0]

# deleting all columns of the original csv file except for the filename
meta_reddit_negative = meta_reddit_negative[['file_id']].rename(columns = {'file_id': "FILENAME"})

# creating a new column outlining the source of the files (all from reddit)
meta_reddit_negative['SOURCE'] = 'Reddit'

# resetting the index
meta_reddit_negative = meta_reddit_negative.reset_index(drop=True)

# adding a new column to the dataframe for each file's full text and tokenized words
meta_reddit_negative['TEXT'] = ''
meta_reddit_negative['WORD_TOKS'] = ''

for index, row in meta_reddit_negative.iterrows():
    
    # making a new filename that includes the file's path
    filepath_name = 'REDDIT_2008_text/' + row['FILENAME']
    text = codecs.open(corpus_path + filepath_name, "r", encoding='utf8').read()
    
    # saving the new filename and the full text of the current reddit post
    meta_reddit_negative.at[index, 'FILENAME'] = filepath_name
    meta_reddit_negative.at[index, 'TEXT'] = text
    
    # saving the file's text tokenized by word
    meta_reddit_negative.at[index, 'WORD_TOKS'] = word_tokenize(row['TEXT'].lower())

meta_reddit_negative.head(10)

Unnamed: 0,FILENAME,SOURCE,TEXT,WORD_TOKS
0,REDDIT_2008_text/c03hiq5.txt,Reddit,"""She just took the kids and left. After that I...","[``, she, just, took, the, kids, and, left, .,..."
1,REDDIT_2008_text/c03hjlb.txt,Reddit,Well then do your duty and put a bullet in you...,"[well, then, do, your, duty, and, put, a, bull..."
2,REDDIT_2008_text/c03is70.txt,Reddit,Sweet.,"[sweet, .]"
3,REDDIT_2008_text/c03kear.txt,Reddit,Stop acting like this girl didn't deserve it. ...,"[stop, acting, like, this, girl, did, n't, des..."
4,REDDIT_2008_text/c03ppth.txt,Reddit,It's just underreported because most of the cr...,"[it, 's, just, underreported, because, most, o..."
5,REDDIT_2008_text/c03px8l.txt,Reddit,"Not only that, we built it so god knows what k...","[not, only, that, ,, we, built, it, so, god, k..."
6,REDDIT_2008_text/c03raq0.txt,Reddit,Still think we aren't living in a police state...,"[still, think, we, are, n't, living, in, a, po..."
7,REDDIT_2008_text/c03tpbr.txt,Reddit,"Yeah, right! Consider the source: The Washingt...","[yeah, ,, right, !, consider, the, source, :, ..."
8,REDDIT_2008_text/c03wgbu.txt,Reddit,A very effective ban is already in place: insu...,"[a, very, effective, ban, is, already, in, pla..."
9,REDDIT_2008_text/c03xc1s.txt,Reddit,I just can't believe so many Americans are hur...,"[i, just, ca, n't, believe, so, many, american..."


In [169]:
# PROBLEM 2 contd (pt 8)

''' saving the POS tags for each reddit post with net negative votes '''

meta_reddit_negative['POS_TAGS'] = ''

for index, row in meta_reddit_negative.iterrows():
    
    # creating a list of sublists of POS tags where each sublist represents a sentence
    list_of_sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(row['TEXT'])]
    
    # flattening the list of sublists into a simple list of POS tags
    meta_reddit_negative.at[index, 'POS_TAGS'] = [POS_tuple for sublist in list_of_sentences for POS_tuple in sublist]
    
# combining the metadata for negatively-voted reddit posts and the nyt into one dataframe
meta_both = pd.concat([meta_reddit_negative, meta_nyt_cut])
meta_both = meta_both.reset_index(drop=True)

meta_both.head(-20)

Unnamed: 0,FILENAME,SOURCE,TEXT,WORD_TOKS,POS_TAGS
0,REDDIT_2008_text/c03hiq5.txt,Reddit,"""She just took the kids and left. After that I...","[``, she, just, took, the, kids, and, left, .,...","[(``, ``), (She, PRP), (just, RB), (took, VBD)..."
1,REDDIT_2008_text/c03hjlb.txt,Reddit,Well then do your duty and put a bullet in you...,"[well, then, do, your, duty, and, put, a, bull...","[(Well, RB), (then, RB), (do, VB), (your, PRP$..."
2,REDDIT_2008_text/c03is70.txt,Reddit,Sweet.,"[sweet, .]","[(Sweet, NNP), (., .)]"
3,REDDIT_2008_text/c03kear.txt,Reddit,Stop acting like this girl didn't deserve it. ...,"[stop, acting, like, this, girl, did, n't, des...","[(Stop, NNP), (acting, VBG), (like, IN), (this..."
4,REDDIT_2008_text/c03ppth.txt,Reddit,It's just underreported because most of the cr...,"[it, 's, just, underreported, because, most, o...","[(It, PRP), ('s, VBZ), (just, RB), (underrepor..."
...,...,...,...,...,...
6157,NYT_2008/5/5481dcc338f0d874625c9456.txt,NYT,BAGHDAD — Prime Minister \nNuri Kamal al-Malik...,"[baghdad, —, prime, minister, nuri, kamal, al-...","[(BAGHDAD, NNP), (—, NNP), (Prime, NNP), (Mini..."
6158,NYT_2008/5/5481dc6a38f0d874625c944b.txt,NYT,Hoping to curb the increase in the number of y...,"[hoping, to, curb, the, increase, in, the, num...","[(Hoping, VBG), (to, TO), (curb, VB), (the, DT..."
6159,NYT_2008/5/5481dc6538f0d874625c944a.txt,NYT,"BEIRUT, Lebanon — A gunman opened fire in a mo...","[beirut, ,, lebanon, —, a, gunman, opened, fir...","[(BEIRUT, NNP), (,, ,), (Lebanon, NNP), (—, VB..."
6160,NYT_2008/5/5481dc5d38f0d874625c9449.txt,NYT,PARIS — It is \nVladimir V. Putin\n’s first tr...,"[paris, —, it, is, vladimir, v., putin, ’, s, ...","[(PARIS, NNP), (—, NN), (It, PRP), (is, VBZ), ..."


In [170]:
# PROBLEM 2 contd (pt 9)

''' creating a custom pre-processor that only extracts words tagged with "VBZ" (third person singular verb) '''

# assumes that the input is a list of POS tags
def VBZ_preprocessor(list_of_POS_tags):
    
    # a list of the words to output
    output_text_as_list = []
    
    for POS_tuple in list_of_POS_tags:
        
        # if the word in the current POS tuple has numbers in it, do not save it
        if(re.search('[0-9]', POS_tuple[0])):
            continue
        
        # if the word in the current POS tuple is a third person singular verb
        if (POS_tuple[1] in ['VBZ']):
            
            # appends the word to a list of words to keep
            output_text_as_list.append(POS_tuple[0].lower())
    
    # returns a string with each superlative adjective separated by a space
    return " ".join(output_text_as_list)

In [171]:
# PROBLEM 2 contd (pt 10)

from nltk.corpus import words

''' re-doing the previous CountVectorizer step to create a new DTM with only third person singular verbs '''

# creating a new vecorizer
vectorizer = CountVectorizer(input='content', preprocessor=VBZ_preprocessor, stop_words='english', min_df=2, encoding='utf8')
dtm = vectorizer.fit_transform(meta_both['POS_TAGS'])
vocab = vectorizer.get_feature_names()
matrix = dtm.toarray()

# combining the DTM with the metadata (the matrix of 0s and 1s with the vocabulary)
DTM = pd.DataFrame(matrix, columns=vocab)

# attaching the DTM to the original dataframe
new_dtm_both = pd.concat([meta_both, DTM], axis=1)

# changing all instances of "Reddit" to 0 and "NYT" to 1 under the "SOURCE" column
# for the following sklearn cells
new_dtm_both.loc[new_dtm_both.SOURCE == 'Reddit', 'SOURCE'] = 0
new_dtm_both.loc[new_dtm_both.SOURCE == 'NYT', 'SOURCE'] = 1

new_dtm_both.head(-20)

Unnamed: 0,FILENAME,SOURCE,TEXT,WORD_TOKS,POS_TAGS,abandons,abets,ablaze,absurd,abuses,...,worries,worsens,worth,wouldn,wounds,wraps,writes,yankees,yields,zones
0,REDDIT_2008_text/c03hiq5.txt,0,"""She just took the kids and left. After that I...","[``, she, just, took, the, kids, and, left, .,...","[(``, ``), (She, PRP), (just, RB), (took, VBD)...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,REDDIT_2008_text/c03hjlb.txt,0,Well then do your duty and put a bullet in you...,"[well, then, do, your, duty, and, put, a, bull...","[(Well, RB), (then, RB), (do, VB), (your, PRP$...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,REDDIT_2008_text/c03is70.txt,0,Sweet.,"[sweet, .]","[(Sweet, NNP), (., .)]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,REDDIT_2008_text/c03kear.txt,0,Stop acting like this girl didn't deserve it. ...,"[stop, acting, like, this, girl, did, n't, des...","[(Stop, NNP), (acting, VBG), (like, IN), (this...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,REDDIT_2008_text/c03ppth.txt,0,It's just underreported because most of the cr...,"[it, 's, just, underreported, because, most, o...","[(It, PRP), ('s, VBZ), (just, RB), (underrepor...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6157,NYT_2008/5/5481dcc338f0d874625c9456.txt,1,BAGHDAD — Prime Minister \nNuri Kamal al-Malik...,"[baghdad, —, prime, minister, nuri, kamal, al-...","[(BAGHDAD, NNP), (—, NNP), (Prime, NNP), (Mini...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6158,NYT_2008/5/5481dc6a38f0d874625c944b.txt,1,Hoping to curb the increase in the number of y...,"[hoping, to, curb, the, increase, in, the, num...","[(Hoping, VBG), (to, TO), (curb, VB), (the, DT...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6159,NYT_2008/5/5481dc6538f0d874625c944a.txt,1,"BEIRUT, Lebanon — A gunman opened fire in a mo...","[beirut, ,, lebanon, —, a, gunman, opened, fir...","[(BEIRUT, NNP), (,, ,), (Lebanon, NNP), (—, VB...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6160,NYT_2008/5/5481dc5d38f0d874625c9449.txt,1,PARIS — It is \nVladimir V. Putin\n’s first tr...,"[paris, —, it, is, vladimir, v., putin, ’, s, ...","[(PARIS, NNP), (—, NN), (It, PRP), (is, VBZ), ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [172]:
# PROBLEM 2 contd (pt 11)

''' preparing for the ML part of the pset; creating training and test sets '''

# the x values are the words in the DTM
x_values = new_dtm_both.iloc[:, 5:].values.astype(float)

# the y values are the words in the DTM
y_values = new_dtm_both.iloc[:, 1].values.astype(float)

# splitting the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.3)

model = LogisticRegression().fit(X_train, y_train)

# predicting class labels for the test set
predicted = model.predict(X_test)

# generating class probabilities
probs = model.predict_proba(X_test)

# Evaluation metrics; accuracy + ROC
print("Success rate of classification: " + str(accuracy_score(y_test, predicted)))

# Confusion matrix, F-1 score
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

Success rate of classification: 0.8075471698113208
[[   7  350]
 [   7 1491]]
              precision    recall  f1-score   support

         0.0       0.50      0.02      0.04       357
         1.0       0.81      1.00      0.89      1498

    accuracy                           0.81      1855
   macro avg       0.65      0.51      0.47      1855
weighted avg       0.75      0.81      0.73      1855



In [173]:
# PROBLEM 2 contd (pt 12)

''' outputting the top third person singular verbs distinguishing negatively-voted reddit posts vs nyt articles '''

clf = LogisticRegression(penalty='l1', solver='liblinear') # penalty='l1', C=0.1
clf.fit(X_train, y_train)
    
# keeping track of feature names and class labels
feature_names = new_dtm_both.columns[3:].values     
class_labels = new_dtm_both['SOURCE'].unique()

# getting the co-efficients for the features associated with nyt and matching them to their feature name
top20 = np.argsort(np.exp(clf.coef_))[0][-10:] 

# getting the co-efficients for the features associated with reddit matching them to their feature name
bottom20 = np.argsort(np.exp(clf.coef_))[0][:10]

# outputting the top 10 features associated with nyt and the top 10 associated with reddit
print("Top 10 features associated with second class (NYT)\n")
for el in zip(feature_names[top20], np.exp(clf.coef_)[0][top20]):
    print(el)
print("\n")
print("Top 10 features associated with first class (Reddit)\n")
for el in zip(feature_names[bottom20], np.exp(clf.coef_)[0][bottom20]):
    print(el)

Top 10 features associated with second class (NYT)

('protests', 9.204333176713419)
('focuses', 9.224969448927013)
('allies', 9.929720141541189)
('imposes', 10.529795803295354)
('hints', 11.122221743701092)
('anymore', 12.43151833818894)
('represents', 12.588504090219852)
('listens', 13.42851850050355)
('extremists', 29.123866625380735)
('relies', 32.32691381392926)


Top 10 features associated with first class (Reddit)

('subjects', 0.1382172079094966)
('harms', 0.3641719169289015)
('tastes', 0.4355412994374665)
('crops', 0.6072723745031644)
('urges', 0.6893698615671414)
('soars', 0.6962809281120079)
('gazes', 0.926521180101566)
('preaches', 1.0)
('precedes', 1.0)
('praises', 1.0)


In [161]:
# PROBLEM 2 contd -- REFLECTION

# What happened
'''
Recall: I hypothesized that  Reddit would have more physically oriented third-person singular verbs. Indeed, it seems that I 
was correct, in part. In the NYT, I'd venture that the only objectively physical verb is "listens" and perhaps "focuses", 
with the rest of the features being relatively abstract ("oversees", "treasures", "relies"). Now, compare these with 
"harms", "tastes", "urges", and "gazes" from Reddit -- all verbs that seem to denote a certain extent of physical embodiment.
That being said, these were not the kinds of physically oriented verbs I expected! Since these features are distinguishing the
negatively voted Reddit posts, I envisioned stronger verbs like "ruins","destroys", "fights" -- verbs that convey a staunch
political opinion that might get one downvoted. Instead, here we have some rather reflective and seemingly temperate verbs
(with the exception of "harms"), which makes me re-evaluate my conception of posts that are unpopular with the general public.
'''

'\nRecall: I hypothesized that  Reddit would have more physically oriented third-person singular verbs. Indeed, it seems that I \nwas correct, in part. In the NYT, I\'d venture that the only objectively physical verb is "listens" and perhaps "focuses", \nwith the rest of the features being relatively abstract ("contrasts", "hints", "relies"). Now, compare these with \n"tastes", "gazes", "urges", and "reflects" from Reddit -- all verbs that seem to denote a certain extent of physical embodiment.\nThat being said, these were not the kinds of physically oriented verbs I expected! Since these features are distinguishing the\nnegatively voted Reddit posts, I envisioned stronger verbs like "ruins","destroys", "fights" -- verbs that convey a staunch\npolitical opinion that might get one downvoted. Instead, here we have some rather reflective and seemingly temperate verbs, which\nmakes me re-evaluate my conception of posts that are unpopular with the general public.\n'