## Importing Libraries

In [1]:
import praw
from praw.models import MoreComments
import pandas as pd 
import numpy as np
from datetime import datetime
import sys
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import pprint
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
from gensim import corpora, models

unable to import 'smart_open.gcs', disabling that module


## Define Function

In [2]:
def store_link_data(data, index):
    ls = []
    for submission in data:
        ls.append([submission.subreddit_name_prefixed,
                   submission.title, 
                   submission.selftext,
                   datetime.fromtimestamp(submission.created_utc), 
                   submission.upvote_ratio, 
                   submission.score, 
                   submission.permalink,
                   submission.comment_limit,
                   submission.num_comments])
    
    df = pd.DataFrame(ls,
                      columns=['SubReddit',
                               'Title', 
                               'Title_Description',
                               'Creation_Date', 
                               'Up_Vote_Ratio', 
                               'Score', 
                               'Permalink',
                               'Comments',
                               'Comment_Counts'],
                      index = np.linspace(index, index + len(ls)-1, len(ls)))
    
    return df

In [3]:
## REF: https://nlpforhackers.io/topic-modeling/
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic:  ", idx)
      
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
                        ## gets top n elements in decreasing order

In [4]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [5]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

## Initializing Session

In [6]:
cid = #enter your own
csec = #enter your own
ua = #enter your own

In [7]:
reddit = praw.Reddit(client_id= cid,
                     client_secret= csec,
                     user_agent= ua)

print(reddit.read_only)

True


## Gathering Data

In [8]:
#Display only
data = reddit.subreddit('dogman').top(limit = 2)
for submission in data:
    pprint.pprint(vars(submission))

{'_comments_by_id': {},
 '_fetched': False,
 '_reddit': <praw.reddit.Reddit object at 0x1a19a2edd0>,
 'all_awardings': [],
 'allow_live_comments': False,
 'approved_at_utc': None,
 'approved_by': None,
 'archived': False,
 'author': Redditor(name='oceang1rl'),
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_4kqvkemc',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'banned_at_utc': None,
 'banned_by': None,
 'can_gild': False,
 'can_mod_post': False,
 'category': None,
 'clicked': False,
 'comment_limit': 2048,
 'comment_sort': 'confidence',
 'content_categories': None,
 'contest_mode': False,
 'created': 1589362472.0,
 'created_utc': 1589333672.0,
 'discussion_type': None,
 'distinguished': None,
 'domain': 'i.redd.it',
 'downs': 0,
 'edited': False,
 'gi

In [9]:
del data

data = reddit.subreddit('cooking').new(limit = 20)
df = store_link_data(data,0)

data = reddit.subreddit('beer').hot(limit = 20)
df = df.append(store_link_data(data, len(df.index)))
    
data = reddit.subreddit('computer').rising(limit = 20)
df = df.append(store_link_data(data,len(df.index)))
    
data = reddit.subreddit('investing').top(limit = 20)
df = df.append(store_link_data(data,len(df.index)))

print(df.head)

<bound method NDFrame.head of         SubReddit                                              Title  \
0.0     r/Cooking                     Healthy Non Fried McPizza Puff   
1.0     r/Cooking                        Help with chimichurri steak   
2.0     r/Cooking  Am I supposed to add salt to the water before ...   
3.0     r/Cooking  Any other former line cooks out there that tra...   
4.0     r/Cooking  Anyone know of an app or website that can spec...   
...           ...                                                ...   
75.0  r/investing  John Bogle, who founded Vanguard and revolutio...   
76.0  r/investing  The lawyer who took on Big Tobacco and Enron i...   
77.0  r/investing  Today's stock market crash was worse than the ...   
78.0  r/investing  "Brands don’t need Amazon". Nike's decision to...   
79.0  r/investing  70% of new home purchases in China are second ...   

                                      Title_Description       Creation_Date  \
0.0   Easiest Pizza McPuff

In [10]:
for x in range(len(df)):
    url = "https://www.reddit.com" + df.iloc[x, 6]
    submission = reddit.submission(url=url)
    #submission.comment_sort = "top"
    counting = 1
    submission.comments.replace_more(limit=None)
    for comment in submission.comments:
        if isinstance(comment, MoreComments):
            continue
        if counting == 1:
            comt = ''
        comt = comt + 'Author: {}, Ups: {}, Date: {}, \nComment: {}\n\n'.format(comment.author,
                                                                      comment.ups,
                                                                      datetime.fromtimestamp(submission.created_utc),
                                                                      comment.body)
        counting = counting + 1
        
    if counting > 1:
        df.iloc[x,7] = comt
        df.iloc[x,8] = counting - 1
            
    else:
        df.iloc[x,7] = ''
        df.iloc[x,8] = 0


In [11]:
for x in range(len(df)):
    url = "https://www.reddit.com" + df.iloc[x, 6]
    submission = reddit.submission(url=url)
    submission.comments.replace_more(limit=None)
    ls = []
    for comment in submission.comments:
        if isinstance(comment, MoreComments):
            continue
        ls.append([comment.subreddit_name_prefixed,comment.body])
    
    if x is 0:
        tiny_df = pd.DataFrame(ls, columns=['LABEL','TEXT'],
                               index = np.linspace(0, len(ls)-1, len(ls)))
    else:
        df2 = pd.DataFrame(ls, columns=['LABEL', 'TEXT'], 
                           index = np.linspace(0, len(ls)-1, len(ls)))
        tiny_df = tiny_df.append(df2, ignore_index=True)

In [12]:
for x in range(len(df)):
    df.iloc[x,2] = df.iloc[x,1] + ' ' + df.iloc[x,2]

## Models for Titles

In [13]:
ngram_vectorizer = CountVectorizer(binary=False, stop_words = "english",ngram_range=(1, 2))
ngram_vectorizer.fit(df['Title_Description'])
x_train, x_test, y_train, y_test = train_test_split(df['Title_Description'], df['SubReddit'], train_size = 0.65,random_state = 0)
ColumnNames = ngram_vectorizer.get_feature_names()
X_TRAIN = ngram_vectorizer.transform(x_train)
TDM1=pd.DataFrame(X_TRAIN.toarray(),columns=ColumnNames)
X_TEST = ngram_vectorizer.transform(x_test)
TDM2=pd.DataFrame(X_TEST.toarray(),columns=ColumnNames)

for a in ['poly', 'rbf','linear']:
    for c in [0.01, 1, 10,100]: 
        svm = SVC(kernel = a,C = c, gamma = 'auto')
        svm.fit(TDM1, y_train)
        print ("Kernel %s with Accuracy for C=%s:%s" 
               % (a , c ,accuracy_score(y_test, svm.predict(TDM2))))

nb_clf= MultinomialNB()
scores = cross_val_score(nb_clf, ngram_vectorizer.transform(df['Title_Description']), df['SubReddit'], cv=10)
avg=sum(scores)/len(scores)
print(avg)



Kernel poly with Accuracy for C=0.01:0.14285714285714285
Kernel poly with Accuracy for C=1:0.14285714285714285
Kernel poly with Accuracy for C=10:0.14285714285714285
Kernel poly with Accuracy for C=100:0.14285714285714285
Kernel rbf with Accuracy for C=0.01:0.14285714285714285
Kernel rbf with Accuracy for C=1:0.14285714285714285
Kernel rbf with Accuracy for C=10:0.17857142857142858
Kernel rbf with Accuracy for C=100:0.6071428571428571
Kernel linear with Accuracy for C=0.01:0.35714285714285715
Kernel linear with Accuracy for C=1:0.5357142857142857
Kernel linear with Accuracy for C=10:0.5357142857142857
Kernel linear with Accuracy for C=100:0.5357142857142857
0.775


In [14]:
ngram_vectorizer = TfidfVectorizer(binary=False, stop_words = "english",ngram_range=(1, 2))
ngram_vectorizer.fit(df['Title_Description'])
x_train, x_test, y_train, y_test = train_test_split(df['Title_Description'], df['SubReddit'], train_size = 0.65,random_state = 0)
ColumnNames = ngram_vectorizer.get_feature_names()
X_TRAIN = ngram_vectorizer.transform(x_train)
TDM1=pd.DataFrame(X_TRAIN.toarray(),columns=ColumnNames)
X_TEST = ngram_vectorizer.transform(x_test)
TDM2=pd.DataFrame(X_TEST.toarray(),columns=ColumnNames)

for a in ['poly', 'rbf','linear']:
    for c in [0.01, 1, 10,100]: 
        svm = SVC(kernel = a,C = c, gamma = 'auto')
        svm.fit(TDM1, y_train)
        print ("Kernel %s with Accuracy for C=%s:%s" 
               % (a , c ,accuracy_score(y_test, svm.predict(TDM2))))

nb_clf= MultinomialNB()
scores = cross_val_score(nb_clf, ngram_vectorizer.transform(df['Title_Description']), df['SubReddit'], cv=10)
avg=sum(scores)/len(scores)
print(avg)



Kernel poly with Accuracy for C=0.01:0.14285714285714285
Kernel poly with Accuracy for C=1:0.14285714285714285
Kernel poly with Accuracy for C=10:0.14285714285714285
Kernel poly with Accuracy for C=100:0.14285714285714285
Kernel rbf with Accuracy for C=0.01:0.14285714285714285
Kernel rbf with Accuracy for C=1:0.14285714285714285
Kernel rbf with Accuracy for C=10:0.14285714285714285
Kernel rbf with Accuracy for C=100:0.14285714285714285
Kernel linear with Accuracy for C=0.01:0.14285714285714285
Kernel linear with Accuracy for C=1:0.4642857142857143
Kernel linear with Accuracy for C=10:0.4642857142857143
Kernel linear with Accuracy for C=100:0.4642857142857143
0.85


In [15]:
ngram_vectorizer = CountVectorizer(binary=True, stop_words = "english",ngram_range=(1, 2))
ngram_vectorizer.fit(df['Title_Description'])
x_train, x_test, y_train, y_test = train_test_split(df['Title_Description'], df['SubReddit'], train_size = 0.65,random_state = 0)
ColumnNames = ngram_vectorizer.get_feature_names()
X_TRAIN = ngram_vectorizer.transform(x_train)
TDM1=pd.DataFrame(X_TRAIN.toarray(),columns=ColumnNames)
X_TEST = ngram_vectorizer.transform(x_test)
TDM2=pd.DataFrame(X_TEST.toarray(),columns=ColumnNames)

for a in ['poly', 'rbf','linear']:
    for c in [0.01, 1, 10,100]: 
        svm = SVC(kernel = a,C = c, gamma = 'auto')
        svm.fit(TDM1, y_train)
        print ("Kernel %s with Accuracy for C=%s:%s" 
               % (a , c ,accuracy_score(y_test, svm.predict(TDM2))))

nb_clf= MultinomialNB()
scores = cross_val_score(nb_clf, ngram_vectorizer.transform(df['Title_Description']), df['SubReddit'], cv=10)
avg=sum(scores)/len(scores)
print(avg)



Kernel poly with Accuracy for C=0.01:0.14285714285714285
Kernel poly with Accuracy for C=1:0.14285714285714285
Kernel poly with Accuracy for C=10:0.14285714285714285
Kernel poly with Accuracy for C=100:0.14285714285714285
Kernel rbf with Accuracy for C=0.01:0.14285714285714285
Kernel rbf with Accuracy for C=1:0.14285714285714285
Kernel rbf with Accuracy for C=10:0.14285714285714285
Kernel rbf with Accuracy for C=100:0.42857142857142855
Kernel linear with Accuracy for C=0.01:0.17857142857142858
Kernel linear with Accuracy for C=1:0.5
Kernel linear with Accuracy for C=10:0.5
Kernel linear with Accuracy for C=100:0.5
0.7375


## Models for Comments

In [16]:
ngram_vectorizer = CountVectorizer(binary=False, stop_words = "english",ngram_range=(1, 2), max_features = 700)
ngram_vectorizer.fit(tiny_df['TEXT'])
x_train, x_test, y_train, y_test = train_test_split(tiny_df['TEXT'], tiny_df['LABEL'], train_size = 0.65,random_state = 0)
ColumnNames = ngram_vectorizer.get_feature_names()
X_TRAIN = ngram_vectorizer.transform(x_train)
TDM1=pd.DataFrame(X_TRAIN.toarray(),columns=ColumnNames)
X_TEST = ngram_vectorizer.transform(x_test)
TDM2=pd.DataFrame(X_TEST.toarray(),columns=ColumnNames)

for a in ['poly', 'rbf','linear']:
    for c in [0.01, 1, 10,100]: 
        svm = SVC(kernel = a,C = c, gamma = 'auto')
        svm.fit(TDM1, y_train)
        print ("Kernel %s with Accuracy for C=%s:%s" 
               % (a , c ,accuracy_score(y_test, svm.predict(TDM2))))

nb_clf= MultinomialNB()
scores = cross_val_score(nb_clf, ngram_vectorizer.transform(tiny_df['TEXT']), tiny_df['LABEL'], cv=10)
avg=sum(scores)/len(scores)
print(avg)



Kernel poly with Accuracy for C=0.01:0.8918387413962635
Kernel poly with Accuracy for C=1:0.8918387413962635
Kernel poly with Accuracy for C=10:0.8918387413962635
Kernel poly with Accuracy for C=100:0.8918387413962635
Kernel rbf with Accuracy for C=0.01:0.8918387413962635
Kernel rbf with Accuracy for C=1:0.9006882989183874
Kernel rbf with Accuracy for C=10:0.9124877089478859
Kernel rbf with Accuracy for C=100:0.9252704031465093
Kernel linear with Accuracy for C=0.01:0.9065880039331367
Kernel linear with Accuracy for C=1:0.9183874139626352
Kernel linear with Accuracy for C=10:0.880039331366765
Kernel linear with Accuracy for C=100:0.8751229105211407
0.8969337059419253


In [17]:
ngram_vectorizer = TfidfVectorizer(binary=False, stop_words = "english",ngram_range=(1, 2), max_features = 700)
ngram_vectorizer.fit(tiny_df['TEXT'])
x_train, x_test, y_train, y_test = train_test_split(tiny_df['TEXT'], tiny_df['LABEL'], train_size = 0.65,random_state = 0)
ColumnNames = ngram_vectorizer.get_feature_names()
X_TRAIN = ngram_vectorizer.transform(x_train)
TDM1=pd.DataFrame(X_TRAIN.toarray(),columns=ColumnNames)
X_TEST = ngram_vectorizer.transform(x_test)
TDM2=pd.DataFrame(X_TEST.toarray(),columns=ColumnNames)

for a in ['poly', 'rbf','linear']:
    for c in [0.01, 1, 10,100]: 
        svm = SVC(kernel = a,C = c, gamma = 'auto')
        svm.fit(TDM1, y_train)
        print ("Kernel %s with Accuracy for C=%s:%s" 
               % (a , c ,accuracy_score(y_test, svm.predict(TDM2))))

nb_clf= MultinomialNB()
scores = cross_val_score(nb_clf, ngram_vectorizer.transform(tiny_df['TEXT']), tiny_df['LABEL'], cv=10)
avg=sum(scores)/len(scores)
print(avg)



Kernel poly with Accuracy for C=0.01:0.8918387413962635
Kernel poly with Accuracy for C=1:0.8918387413962635
Kernel poly with Accuracy for C=10:0.8918387413962635
Kernel poly with Accuracy for C=100:0.8918387413962635
Kernel rbf with Accuracy for C=0.01:0.8918387413962635
Kernel rbf with Accuracy for C=1:0.8918387413962635
Kernel rbf with Accuracy for C=10:0.8918387413962635
Kernel rbf with Accuracy for C=100:0.9164208456243854
Kernel linear with Accuracy for C=0.01:0.8918387413962635
Kernel linear with Accuracy for C=1:0.9311701081612586
Kernel linear with Accuracy for C=10:0.9026548672566371
Kernel linear with Accuracy for C=100:0.8859390363815143
0.916624331201515


In [18]:
ngram_vectorizer = CountVectorizer(binary=True, stop_words = "english",ngram_range=(1, 2), max_features = 700)
ngram_vectorizer.fit(tiny_df['TEXT'])
x_train, x_test, y_train, y_test = train_test_split(tiny_df['TEXT'], tiny_df['LABEL'], train_size = 0.65,random_state = 0)
ColumnNames = ngram_vectorizer.get_feature_names()
X_TRAIN = ngram_vectorizer.transform(x_train)
TDM1=pd.DataFrame(X_TRAIN.toarray(),columns=ColumnNames)
X_TEST = ngram_vectorizer.transform(x_test)
TDM2=pd.DataFrame(X_TEST.toarray(),columns=ColumnNames)

for a in ['poly', 'rbf','linear']:
    for c in [0.01, 0.05, 0.25, 0.5, 1]: 
        svm = SVC(kernel = a,C = c, gamma = 'auto')
        svm.fit(TDM1, y_train)
        print ("Kernel %s with Accuracy for C=%s:%s" 
               % (a , c ,accuracy_score(y_test, svm.predict(TDM2))))

nb_clf= MultinomialNB()
scores = cross_val_score(nb_clf, ngram_vectorizer.transform(tiny_df['TEXT']), tiny_df['LABEL'], cv=10)
avg=sum(scores)/len(scores)
print(avg)



Kernel poly with Accuracy for C=0.01:0.8918387413962635
Kernel poly with Accuracy for C=0.05:0.8918387413962635
Kernel poly with Accuracy for C=0.25:0.8918387413962635
Kernel poly with Accuracy for C=0.5:0.8918387413962635
Kernel poly with Accuracy for C=1:0.8918387413962635
Kernel rbf with Accuracy for C=0.01:0.8918387413962635
Kernel rbf with Accuracy for C=0.05:0.8918387413962635
Kernel rbf with Accuracy for C=0.25:0.8918387413962635
Kernel rbf with Accuracy for C=0.5:0.8918387413962635
Kernel rbf with Accuracy for C=1:0.9006882989183874
Kernel linear with Accuracy for C=0.01:0.9026548672566371
Kernel linear with Accuracy for C=0.05:0.9203539823008849
Kernel linear with Accuracy for C=0.25:0.9262536873156342
Kernel linear with Accuracy for C=0.5:0.9272369714847591
Kernel linear with Accuracy for C=1:0.9233038348082596
0.8914352134599293


## Save Data

In [26]:
df.to_csv('hw7data.csv')

In [27]:
tiny_df.to_csv('hw7datap2.csv')