In [181]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from itertools import chain
from collections import Counter
import gensim
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import label_binarize
from porter import PorterStemmer

In [223]:
cleaned = pd.read_csv("../data/cleaned.csv")
images = pd.read_csv("../data/images.csv")

In [224]:
cleaned.head(5)

Unnamed: 0,rating,cleaned,bigram,tweet,created_at,replies_count,retweets_count,likes_count,img_url
0,9,new view love art presentation perspective lig...,newview viewlove loveart artpresentation prese...,New view. Love the art. Presentation. Perspect...,2021-03-17 16:24:35 EDT,3,2,62,https://pbs.twimg.com/media/EwtT4E7UUAAgt5w.jpg
1,9,st paddy’s day edition great wall color maps a...,stpaddy paddy’ ’s sday dayedition editiongreat...,Room Rater St Paddy’s Day Edition. Great wall ...,2021-03-17 15:25:51 EDT,2,2,57,https://pbs.twimg.com/media/EwtGbzTUYAAxDiu.jpg
2,10,doors dublin dingus-the stuff dreams made nice...,doorsdublin dublindingus-the dingus-thestuff s...,Doors of Dublin. The Dingus-the stuff that dre...,2021-03-17 15:19:53 EDT,7,4,123,https://pbs.twimg.com/media/EwtFESHVEAEYlzv.jpg
3,9,good tight set up love blue art flag widen sho...,goodtight tightset setup uplove loveblue bluea...,Good tight set up. Love the blue. Art. Flag. W...,2021-03-17 13:46:47 EDT,1,1,60,https://pbs.twimg.com/media/EwsvwlOU8AQ5lOY.jpg
4,0,dark never escape stank wrong about everything...,darknever neverescape escapestank stankwrong w...,Dark. Will never escape the stank. Wrong about...,2021-03-17 13:11:17 EDT,35,38,846,https://pbs.twimg.com/media/Ewsnoq5U8AMxxAg.jpg


In [225]:
cleaned.iloc[0]["tweet"]

'New view. Love the art. Presentation. Perspective. Light/lighting. Widen shot slightly. 9/10 @DavidJollyFL  https://t.co/oZJcRkTSJd'

In [226]:
cleaned.iloc[7]["cleaned"]

'book club love art lighting flowers arkansas traveler @iamsophianelson'

In [227]:
# lowest rating
low_rating = cleaned[cleaned['rating'] == 0]
low_rating.head(3) # yields row 4, 20, 28

# semi low rating
semi_low_rating = cleaned[cleaned['rating'] == 3]
semi_low_rating.head(3) # yields row 75, 135, 153

# middle rating
middle_rating = cleaned[cleaned['rating'] == 5]
middle_rating.head(3) # yields rows 169, 240, 245

#semi middle rating
semi_middle_rating = cleaned[cleaned['rating'] == 7]
semi_middle_rating.head(3) # yields rows 37, 45, 69

# semi high rating
semi_high_rating = cleaned[cleaned['rating'] == 8]
semi_high_rating.head(3) # yields rows 8, 9, 17

# highest rating
highest_rating = cleaned[cleaned['rating'] == 10]
highest_rating.head(3) # yields rows 2, 5, 7


Unnamed: 0,rating,cleaned,bigram,tweet,created_at,replies_count,retweets_count,likes_count,img_url
2,10,doors dublin dingus-the stuff dreams made nice...,doorsdublin dublindingus-the dingus-thestuff s...,Doors of Dublin. The Dingus-the stuff that dre...,2021-03-17 15:19:53 EDT,7,4,123,https://pbs.twimg.com/media/EwtFESHVEAEYlzv.jpg
5,10,st paddy’s day update @philiprucker green tie,stpaddy paddy’ ’s sday dayupdate update@ @phil...,Room Rater St Paddy’s Day Update. @PhilipRucke...,2021-03-17 12:34:31 EDT,10,12,620,https://pbs.twimg.com/media/EwsfN-yVoAAcVk8.jpg
7,10,book club love art lighting flowers arkansas t...,bookclub clublove loveart artlighting lighting...,Room Rater Book Club. Love the art. Lighting. ...,2021-03-17 11:49:52 EDT,1,4,87,https://pbs.twimg.com/media/EwsVABRVgAEJGvE.jpg


In [228]:
len(cleaned)

12619

## **ADDITIONAL PREPROCESSING:**
    * Removing user tags (starting with @)
    * Stemming each cleaned word in tweet
    * Add column for default tokens (without removing stopwords)

In [235]:
preprocessed = cleaned[['rating','cleaned', 'tweet']]

In [236]:
preprocessed.head(10)

Unnamed: 0,rating,cleaned,tweet
0,9,new view love art presentation perspective lig...,New view. Love the art. Presentation. Perspect...
1,9,st paddy’s day edition great wall color maps a...,Room Rater St Paddy’s Day Edition. Great wall ...
2,10,doors dublin dingus-the stuff dreams made nice...,Doors of Dublin. The Dingus-the stuff that dre...
3,9,good tight set up love blue art flag widen sho...,Good tight set up. Love the blue. Art. Flag. W...
4,0,dark never escape stank wrong about everything...,Dark. Will never escape the stank. Wrong about...
5,10,st paddy’s day update @philiprucker green tie,Room Rater St Paddy’s Day Update. @PhilipRucke...
6,6,sun set ups tough raise camera late christmas ...,Sun room set ups are tough. Raise camera. Late...
7,10,book club love art lighting flowers arkansas t...,Room Rater Book Club. Love the art. Lighting. ...
8,8,ducks packers big plant well composed sports s...,Ducks. Packers. Big plant. Well composed sport...
9,8,well composed set up good spacing lower camera...,Well composed set up. Good spacing. Lower came...


In [237]:
def remove_tags(tweet):
    if pd.isnull(tweet):
        return []
    #print(tweet)
    tokens = tweet.split(' ')
    tokens = list(filter(lambda x: '@' not in x, tokens))
    return tokens

def stemWords(tokens):
    result = []
    if tokens is None:
        return tokens
    for token in tokens:
        if token.isalpha():
            stem = PorterStemmer()
            stemmed_word = stem.stem(token, 0, len(token) - 1)
            result.append(stemmed_word)
        else: 
            result.append(token)

    return result

def split(tweet):
    if pd.isnull(tweet):
        return []
    tokens = tweet.split(' ')
    return tokens

def strip_ratings(tweet):
    tweet = re.sub(rating_pattern, "", tweet)
    tweet = re.sub("[Mm]inus", "", tweet)
    return tweet

def strip_links(tweet):
    tweet = re.sub("https?://t.co/\w+", "", tweet)
    return tweet

def strip_end_punctuation_and_lower(tweet):
    tweet_list = tweet.split()
    return ' '.join([i.lower().rstrip(string.punctuation) for i in tweet_list])

def default_preprocess(tweet):
    tweet = strip_ratings(tweet)
    tweet = strip_links(tweet)
    tweet = strip_end_punctuation_and_lower(tweet) 
    tokens = tweet.split(" ")
    return tokens

rating_pattern = re.compile(r"((- ?)?\d+)/10")


In [238]:
preprocessed['tags_removed'] = preprocessed['cleaned'].apply(lambda x : remove_tags(x))
preprocessed['stemmed'] = preprocessed['tags_removed'].apply(lambda x : stemWords(x))
preprocessed['cleaned'] = preprocessed['cleaned'].apply(lambda x : split(x))
preprocessed['default'] = preprocessed['tweet'].apply(lambda x : default_preprocess(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [239]:
preprocessed

Unnamed: 0,rating,cleaned,tweet,tags_removed,stemmed,default
0,9,"[new, view, love, art, presentation, perspecti...",New view. Love the art. Presentation. Perspect...,"[new, view, love, art, presentation, perspecti...","[new, view, love, art, present, perspect, ligh...","[new, view, love, the, art, presentation, pers..."
1,9,"[st, paddy’s, day, edition, great, wall, color...",Room Rater St Paddy’s Day Edition. Great wall ...,"[st, paddy’s, day, edition, great, wall, color...","[st, paddy’s, dai, edit, great, wall, color, m...","[room, rater, st, paddy’s, day, edition, great..."
2,10,"[doors, dublin, dingus-the, stuff, dreams, mad...",Doors of Dublin. The Dingus-the stuff that dre...,"[doors, dublin, dingus-the, stuff, dreams, mad...","[door, dublin, dingus-the, stuff, dream, made,...","[doors, of, dublin, the, dingus-the, stuff, th..."
3,9,"[good, tight, set, up, love, blue, art, flag, ...",Good tight set up. Love the blue. Art. Flag. W...,"[good, tight, set, up, love, blue, art, flag, ...","[good, tight, set, up, love, blue, art, flag, ...","[good, tight, set, up, love, the, blue, art, f..."
4,0,"[dark, never, escape, stank, wrong, about, eve...",Dark. Will never escape the stank. Wrong about...,"[dark, never, escape, stank, wrong, about, eve...","[dark, never, escap, stank, wrong, about, ever...","[dark, will, never, escape, the, stank, wrong,..."
...,...,...,...,...,...,...
12614,8,"[blue, works, not, just, books, plus, good, us...",The blue works. Not just books a plus. Good us...,"[blue, works, not, just, books, plus, good, us...","[blue, work, not, just, book, plu, good, us, s...","[the, blue, works, not, just, books, a, plus, ..."
12615,2,"[“i’ll, just, put, sweatpants”, skype, rooms]",This is the “I’ll just put on sweatpants” of S...,"[“i’ll, just, put, sweatpants”, skype, rooms]","[“i’ll, just, put, sweatpants”, skype, room]","[this, is, the, “i’ll, just, put, on, sweatpan..."
12616,3,"[books, too, dark, little, way, personal, style]",All books. Too dark. Little in way of personal...,"[books, too, dark, little, way, personal, style]","[book, too, dark, littl, wai, person, style]","[all, books, too, dark, little, in, way, of, p..."
12617,4,"[books, always, must, little, too, obvious, su...",Books always a must but a little too obvious f...,"[books, always, must, little, too, obvious, su...","[book, alwai, must, littl, too, obviou, such, ...","[books, always, a, must, but, a, little, too, ..."


In [241]:
preprocessed.iloc[0]["default"]

['new',
 'view',
 'love',
 'the',
 'art',
 'presentation',
 'perspective',
 'light/lighting',
 'widen',
 'shot',
 'slightly',
 '@davidjollyfl']

# **Utilizing LDA Topic Modeling**

**Only LDA Topic Model:** 
   1. Fits data into LDA to get topic models, topic model is trained
   2. Topic model is applied back to training reviews + each review gets topic
      distribution/probabilities
   3. Topic probabilities are used as features for rating prediction + fitted
      into linear regression model


## Fitting Only LDA

In [88]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
# preprocessed has columns: cleaned, tags_removed, stemmed, default

### Creating a Bag of Words for Cleaned + Stemmed

In [242]:
# Creating bag of words

# cleaned column
token_dict_cleaned = Dictionary(preprocessed.cleaned)
bag_of_words_cleaned = [token_dict_cleaned.doc2bow(tweet) for tweet in preprocessed['cleaned']]

# stemmed
token_dict_stemmed = Dictionary(preprocessed.stemmed)
bag_of_words_stemmed = [token_dict_stemmed.doc2bow(tweet) for tweet in preprocessed['stemmed']]

# default
token_dict_default = Dictionary(preprocessed.default)
bag_of_words_default = [token_dict_default.doc2bow(tweet) for tweet in preprocessed['default']]

### Fitting Cleaned + Stemmed data into LDA model with 5 topics

In [243]:
# Fitting data into LDA (5 TOPICS)

# cleaned
lda_5_cleaned = LdaModel(bag_of_words_cleaned, 
                    num_topics = 5,
                    id2word = token_dict_cleaned,
                    random_state = 1, 
                    passes = 10)

#print(lda_5_cleaned.show_topics())

# stemmed
lda_5_stemmed = LdaModel(bag_of_words_stemmed, 
                    num_topics = 5,
                    id2word = token_dict_stemmed,
                    random_state = 1, 
                    passes = 10)

lda_5_stemmed.show_topics()

# default
lda_5_default = LdaModel(bag_of_words_default, 
                    num_topics = 5,
                    id2word = token_dict_default,
                    random_state = 1, 
                    passes = 10)

lda_5_default.show_topics()

[(0,
  '0.021*"of" + 0.014*"books" + 0.013*"nice" + 0.013*"camera" + 0.013*"raise" + 0.010*"a" + 0.010*"and" + 0.009*"book" + 0.009*"for" + 0.009*"up"'),
 (1,
  '0.057*"the" + 0.034*"art" + 0.031*"love" + 0.022*"and" + 0.020*"a" + 0.016*"add" + 0.016*"plant" + 0.015*"on" + 0.014*"light" + 0.013*"to"'),
 (2,
  '0.037*"the" + 0.033*"a" + 0.021*"is" + 0.019*"we" + 0.015*"room" + 0.013*"for" + 0.012*"in" + 0.012*"" + 0.011*"to" + 0.011*"and"'),
 (3,
  '0.034*"a" + 0.029*"room" + 0.024*"the" + 0.022*"" + 0.018*"to" + 0.017*"and" + 0.017*"is" + 0.016*"this" + 0.016*"in" + 0.015*"rater"'),
 (4,
  '0.053*"well" + 0.026*"art" + 0.020*"lit" + 0.017*"good" + 0.014*"plant" + 0.012*"up" + 0.011*"a" + 0.011*"books" + 0.010*"set" + 0.010*"nice"')]

### Fitting Cleaned + Stemmed data into LDA model with 10 topics

In [244]:
# Fitting data into LDA (10 TOPICS)

# cleaned
lda_10_cleaned = LdaModel(bag_of_words_cleaned, 
                    num_topics = 10,
                    id2word = token_dict_cleaned,
                    random_state = 1, 
                    passes = 10)

#print(lda_5_cleaned.show_topics())

# stemmed
lda_10_stemmed = LdaModel(bag_of_words_stemmed, 
                    num_topics = 10,
                    id2word = token_dict_stemmed,
                    random_state = 1, 
                    passes = 10)

lda_10_stemmed.show_topics()

# default
lda_10_default = LdaModel(bag_of_words_default, 
                    num_topics = 10,
                    id2word = token_dict_default,
                    random_state = 1, 
                    passes = 10)

lda_10_default.show_topics()

[(0,
  '0.087*"camera" + 0.073*"raise" + 0.020*"lower" + 0.016*"nice" + 0.013*"slightly" + 0.012*"bigger" + 0.012*"beams" + 0.012*"reframe" + 0.011*"move" + 0.010*"height"'),
 (1,
  '0.069*"the" + 0.041*"art" + 0.040*"love" + 0.023*"great" + 0.019*"plant" + 0.017*"lighting" + 0.017*"depth" + 0.016*"good" + 0.016*"for" + 0.013*"flowers"'),
 (2,
  '0.054*"room" + 0.040*"a" + 0.031*"rater" + 0.023*"" + 0.020*"skype" + 0.020*"and" + 0.020*"for" + 0.018*"the" + 0.017*"rooms" + 0.017*"to"'),
 (3,
  '0.056*"a" + 0.029*"is" + 0.024*"but" + 0.024*"to" + 0.023*"you" + 0.020*"this" + 0.019*"" + 0.019*"we" + 0.019*"not" + 0.018*"the"'),
 (4,
  '0.044*"" + 0.017*"music" + 0.012*"start" + 0.012*"personal" + 0.012*"guy" + 0.011*"texture" + 0.009*"expect" + 0.009*"home" + 0.008*"perfectly" + 0.008*"hip"'),
 (5,
  '0.045*"art" + 0.034*"a" + 0.029*"well" + 0.026*"add" + 0.025*"the" + 0.022*"plant" + 0.020*"good" + 0.018*"on" + 0.018*"love" + 0.018*"nice"'),
 (6,
  '0.031*"doors" + 0.021*"deduction" + 0.

### Fitting Cleaned + Stemmed data into LDA model with 20 topics

In [93]:
# Fitting data into LDA (20 TOPICS)
lda_20 = LdaModel(bag_of_words_cleaned, 
                    num_topics = 20,
                    id2word = token_dict,
                    random_state = 1, 
                    passes = 10)

lda_20.show_topics()

[(10,
  '0.101*"love" + 0.094*"art" + 0.076*"great" + 0.036*"depth" + 0.034*"lighting" + 0.021*"plant" + 0.021*"flowers" + 0.020*"lamp" + 0.017*"chair" + 0.015*"blue"'),
 (14,
  '0.054*"see" + 0.031*"so" + 0.029*"can" + 0.028*"space" + 0.026*"like" + 0.023*"more" + 0.018*"make" + 0.018*"would" + 0.017*"art" + 0.017*"fix"'),
 (2,
  '0.090*"it’s" + 0.038*"always" + 0.035*"bad" + 0.032*"succulent" + 0.026*"way" + 0.024*"still" + 0.021*"top" + 0.020*"being" + 0.016*"dog" + 0.015*"position"'),
 (16,
  '0.080*"out" + 0.043*"not" + 0.029*"ok" + 0.029*"crop" + 0.026*"he’s" + 0.023*"that’s" + 0.022*"ceiling" + 0.018*"doesn’t" + 0.017*"music" + 0.017*"sure"'),
 (7,
  '0.049*"lovely" + 0.030*"than" + 0.023*"elegant" + 0.021*"cabinet" + 0.019*"open" + 0.019*"collection" + 0.019*"yellow" + 0.019*"nailed" + 0.018*"start" + 0.017*"update"'),
 (3,
  '0.025*"there’s" + 0.024*"1" + 0.021*"we’ll" + 0.020*"we’ve" + 0.017*"@jheil" + 0.016*"seen" + 0.015*"yet" + 0.015*"someone" + 0.014*"help" + 0.014*"best"

### For each cleaned + stemmed data, apply both 5 and 10 topic models to each tweet, output 5 topic and 10 topic probabilities for each tweet

In [245]:
def extract_topic_probabilities_cleaned(tokens, num_topics):
    bow = token_dict_cleaned.doc2bow(tokens)
    if num_topics == 10: 
        topic_probs = lda_10_cleaned.get_document_topics(bow)
    if num_topics == 5:
        topic_probs = lda_5_cleaned.get_document_topics(bow)
        
    probs = []
        
    probs_dict = dict(topic_probs)
        
    for i in range (0, num_topics):
        if i not in probs_dict: 
            probs.append(0)
        else: 
            probs.append(probs_dict[i])
    
    return probs

def extract_topic_probabilities_stemmed(tokens, num_topics):
    bow = token_dict_stemmed.doc2bow(tokens)
    if num_topics == 10: 
        topic_probs = lda_10_stemmed.get_document_topics(bow)
    if num_topics == 5:
        topic_probs = lda_5_stemmed.get_document_topics(bow)
        
    probs = []
        
    probs_dict = dict(topic_probs)
        
    for i in range (0, num_topics):
        if i not in probs_dict: 
            probs.append(0)
        else: 
            probs.append(probs_dict[i])
    
    return probs

def extract_topic_probabilities_default(tokens, num_topics):
    bow = token_dict_default.doc2bow(tokens)
    if num_topics == 10: 
        topic_probs = lda_10_default.get_document_topics(bow)
    if num_topics == 5:
        topic_probs = lda_5_default.get_document_topics(bow)
        
    probs = []
        
    probs_dict = dict(topic_probs)
        
    for i in range (0, num_topics):
        if i not in probs_dict: 
            probs.append(0)
        else: 
            probs.append(probs_dict[i])
    
    return probs
   
extract_topic_probabilities_cleaned(processed, 10)

[0.010069676,
 0.6693375,
 0.010069676,
 0.25009936,
 0.010071219,
 0.010070459,
 0.010069675,
 0.0100699365,
 0.010070947,
 0.010071544]

### Adding topic probabilities as columns to data

In [246]:
preprocessed['lda_10_probs_cleaned'] = preprocessed['cleaned'].apply(lambda x : extract_topic_probabilities_cleaned(x, 10))
preprocessed['lda_5_probs_cleaned'] = preprocessed['cleaned'].apply(lambda x : extract_topic_probabilities_cleaned(x, 5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [247]:
preprocessed['lda_10_probs_stemmed'] = preprocessed['stemmed'].apply(lambda x : extract_topic_probabilities_stemmed(x, 10))
preprocessed['lda_5_probs_stemmed'] = preprocessed['stemmed'].apply(lambda x : extract_topic_probabilities_stemmed(x, 5))

In [248]:
preprocessed['lda_10_probs_default'] = preprocessed['default'].apply(lambda x : extract_topic_probabilities_default(x, 10))
preprocessed['lda_5_probs_default'] = preprocessed['default'].apply(lambda x : extract_topic_probabilities_default(x, 5))

In [249]:
preprocessed.head(5)

Unnamed: 0,rating,cleaned,tweet,tags_removed,stemmed,default,lda_10_probs_cleaned,lda_5_probs_cleaned,lda_10_probs_stemmed,lda_5_probs_stemmed,lda_10_probs_default,lda_5_probs_default
0,9,"[new, view, love, art, presentation, perspecti...",New view. Love the art. Presentation. Perspect...,"[new, view, love, art, presentation, perspecti...","[new, view, love, art, present, perspect, ligh...","[new, view, love, the, art, presentation, pers...","[0, 0.68522346, 0, 0.2477858, 0, 0, 0, 0, 0, 0]","[0.11110278, 0.8379478, 0.016782334, 0.0173642...","[0, 0, 0, 0.14467387, 0, 0, 0, 0, 0.7753759, 0]","[0.018320857, 0.018461967, 0.018315086, 0.9264...","[0, 0.54525924, 0, 0, 0, 0.31564614, 0.0849685...","[0.015535021, 0.9377828, 0.015555755, 0.015524..."
1,9,"[st, paddy’s, day, edition, great, wall, color...",Room Rater St Paddy’s Day Edition. Great wall ...,"[st, paddy’s, day, edition, great, wall, color...","[st, paddy’s, dai, edit, great, wall, color, m...","[room, rater, st, paddy’s, day, edition, great...","[0, 0.11533039, 0, 0.5768728, 0, 0, 0, 0.26115...","[0.013347856, 0.31579176, 0.012890519, 0.57554...","[0, 0.31566647, 0, 0, 0, 0, 0, 0, 0.6309494, 0]","[0.013394717, 0.3650373, 0.013496481, 0.594382...","[0, 0, 0, 0, 0, 0.5381587, 0, 0, 0, 0.4173673]","[0.010641494, 0.23810284, 0.010682813, 0.33756..."
2,10,"[doors, dublin, dingus-the, stuff, dreams, mad...",Doors of Dublin. The Dingus-the stuff that dre...,"[doors, dublin, dingus-the, stuff, dreams, mad...","[door, dublin, dingus-the, stuff, dream, made,...","[doors, of, dublin, the, dingus-the, stuff, th...","[0.19798075, 0.22859469, 0, 0, 0, 0, 0.1222236...","[0.75999737, 0.19633839, 0.014439249, 0.014682...","[0, 0.057735033, 0, 0.54084694, 0, 0.1300493, ...","[0.10219681, 0.19941911, 0.014534805, 0.584886...","[0, 0, 0.10568784, 0, 0, 0, 0.060056217, 0.797...","[0.6363183, 0.33111113, 0.0109798545, 0.010921..."
3,9,"[good, tight, set, up, love, blue, art, flag, ...",Good tight set up. Love the blue. Art. Flag. W...,"[good, tight, set, up, love, blue, art, flag, ...","[good, tight, set, up, love, blue, art, flag, ...","[good, tight, set, up, love, the, blue, art, f...","[0, 0, 0, 0.6056093, 0.3326137, 0, 0, 0, 0, 0]","[0.015479418, 0.015946027, 0.015447189, 0.9374...","[0, 0, 0, 0.4312952, 0, 0, 0, 0, 0.50202626, 0]","[0.016847817, 0.01667914, 0.016725928, 0.93293...","[0, 0.13276435, 0, 0, 0, 0.738581, 0.07868052,...","[0.013553711, 0.94561774, 0.013505388, 0.01350..."
4,0,"[dark, never, escape, stank, wrong, about, eve...",Dark. Will never escape the stank. Wrong about...,"[dark, never, escape, stank, wrong, about, eve...","[dark, never, escap, stank, wrong, about, ever...","[dark, will, never, escape, the, stank, wrong,...","[0, 0, 0, 0, 0, 0, 0.39992622, 0.3857506, 0.15...","[0.018459152, 0.018509388, 0.7792248, 0.018318...","[0.01018724, 0.010193234, 0.2553968, 0.3359198...","[0.38353115, 0.5552803, 0.020458056, 0.0202665...","[0, 0, 0.29659498, 0.57335514, 0, 0, 0.0801155...","[0.012659157, 0.012793749, 0.4296478, 0.53226,..."


### Creating dataframe for each number of topics and preprocessing pair, each dataframe has rating (y) and columns for each topic probability

In [250]:
# 10 TOPICS: CLEANED
lda_features_df_10_cleaned = preprocessed[['rating', 'lda_10_probs_cleaned']]
lda_features_df_10_cleaned[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']] = pd.DataFrame(lda_features_df_10_cleaned.lda_10_probs_cleaned.tolist(), index=lda_features_df_10_cleaned.index)
lda_features_df_10_cleaned = lda_features_df_10_cleaned[['rating', '0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
lda_features_df_10_cleaned

# 10 TOPICS: STEMMED
lda_features_df_10_stemmed = preprocessed[['rating', 'lda_10_probs_stemmed']]
lda_features_df_10_stemmed[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']] = pd.DataFrame(lda_features_df_10_stemmed.lda_10_probs_stemmed.tolist(), index=lda_features_df_10_stemmed.index)
lda_features_df_10_stemmed = lda_features_df_10_stemmed[['rating', '0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
lda_features_df_10_stemmed

# 10 TOPICS: DEFAULT
lda_features_df_10_default = preprocessed[['rating', 'lda_10_probs_default']]
lda_features_df_10_default[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']] = pd.DataFrame(lda_features_df_10_default.lda_10_probs_default.tolist(), index=lda_features_df_10_default.index)
lda_features_df_10_default = lda_features_df_10_default[['rating', '0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
lda_features_df_10_default

# 5 TOPICS: CLEANED
lda_features_df_5_cleaned = preprocessed[['rating', 'lda_5_probs_cleaned']]
lda_features_df_5_cleaned[['0','1', '2', '3', '4']] = pd.DataFrame(lda_features_df_5_cleaned.lda_5_probs_cleaned.tolist(), index=lda_features_df_5_cleaned.index)
lda_features_df_5_cleaned = lda_features_df_5_cleaned[['rating', '0','1', '2', '3', '4']]
lda_features_df_5_cleaned

# 5 TOPICS: STEMMED
lda_features_df_5_stemmed = preprocessed[['rating', 'lda_5_probs_stemmed']]
lda_features_df_5_stemmed[['0','1', '2', '3', '4']] = pd.DataFrame(lda_features_df_5_stemmed.lda_5_probs_stemmed.tolist(), index=lda_features_df_5_stemmed.index)
lda_features_df_5_stemmed = lda_features_df_5_stemmed[['rating', '0','1', '2', '3', '4']]
lda_features_df_5_stemmed

# 5 TOPICS: DEFAULT
lda_features_df_5_default = preprocessed[['rating', 'lda_5_probs_default']]
lda_features_df_5_default[['0','1', '2', '3', '4']] = pd.DataFrame(lda_features_df_5_default.lda_5_probs_default.tolist(), index=lda_features_df_5_default.index)
lda_features_df_5_default = lda_features_df_5_default[['rating', '0','1', '2', '3', '4']]
lda_features_df_5_default.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,rating,0,1,2,3,4
0,9,0.015535,0.937783,0.015556,0.015524,0.015602
1,9,0.010641,0.238103,0.010683,0.337561,0.403012
2,10,0.636318,0.331111,0.01098,0.010922,0.010669
3,9,0.013554,0.945618,0.013505,0.013503,0.01382
4,0,0.012659,0.012794,0.429648,0.53226,0.012639


### Fitting Logistic Regression Model on above dataframes, features (X) = topic probabilities, output (y) = rating

In [251]:
# (10 TOPICS)

# CLEANED

train_size = int(0.1 * len(lda_features_df_10_cleaned))
train = lda_features_df_10_cleaned[:train_size]
test = lda_features_df_10_cleaned[train_size:]

X_all = lda_features_df_10_cleaned[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_all = lda_features_df_10_cleaned['rating']

X_train = train[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_train = train['rating']

X_test = test[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_test = test['rating']

labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

logm_10_cleaned = LogisticRegression()
logm_10_cleaned.fit(X_train, y_train)

print("CLEANED SCORES")

print("10 cleaned training acc: ", logm_10_cleaned.score(X_train, y_train))
print("10 cleaned testing acc: ", logm_10_cleaned.score(X_test, y_test))

y_pred = logm_10_cleaned.predict(X_test)
print("10 cleaned testing F1 score: ", f1_score(y_test, y_pred, average = 'macro'))

y_test_binarized = label_binarize(list(y_test), classes = labels)
y_preds_binarized = label_binarize(list(y_pred), classes = labels)
print("10 cleaned testing roc score: ", roc_auc_score(y_test_binarized, y_preds_binarized, average = 'macro', multi_class = 'ovo'))

# STEMMED

train_size = int(0.1 * len(lda_features_df_10_stemmed))
train = lda_features_df_10_stemmed[:train_size]
test = lda_features_df_10_stemmed[train_size:]

X_all = lda_features_df_10_stemmed[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_all = lda_features_df_10_stemmed['rating']

X_train = train[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_train = train['rating']

X_test = test[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_test = test['rating']

logm_10_stemmed = LogisticRegression()
logm_10_stemmed.fit(X_train, y_train)

print("STEMMED SCORES")

print("10 stemmed training acc: ", logm_10_stemmed.score(X_train, y_train))
print("10 stemmed testing acc: ", logm_10_stemmed.score(X_test, y_test))

y_pred = logm_10_stemmed.predict(X_test)
print("10 stemmed testing F1 score: ", f1_score(y_test, y_pred, average = 'macro'))

y_test_binarized = label_binarize(list(y_test), classes = labels)
y_preds_binarized = label_binarize(list(y_pred), classes = labels)
print("10 stemmed testing roc score: ", roc_auc_score(y_test_binarized, y_preds_binarized, average = 'macro', multi_class = 'ovo'))

# DEFAULT

train_size = int(0.1 * len(lda_features_df_10_default))
train = lda_features_df_10_default[:train_size]
test = lda_features_df_10_default[train_size:]

X_all = lda_features_df_10_default[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_all = lda_features_df_10_default['rating']

X_train = train[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_train = train['rating']

X_test = test[['0','1', '2', '3', '4', '5', '6', '7', '8', '9']]
y_test = test['rating']

logm_10_default = LogisticRegression()
logm_10_default.fit(X_train, y_train)

print("DEFAULT SCORES")

print("10 default training acc: ", logm_10_stemmed.score(X_train, y_train))
print("10 default testing acc: ", logm_10_stemmed.score(X_test, y_test))

y_pred = logm_10_stemmed.predict(X_test)
print("10 default testing F1 score: ", f1_score(y_test, y_pred, average = 'macro'))

y_test_binarized = label_binarize(list(y_test), classes = labels)
y_preds_binarized = label_binarize(list(y_pred), classes = labels)
print("10 default testing roc score: ", roc_auc_score(y_test_binarized, y_preds_binarized, average = 'macro', multi_class = 'ovo'))

CLEANED SCORES
10 cleaned training acc:  0.4409199048374306
10 cleaned testing acc:  0.28385279098432825
10 cleaned testing F1 score:  0.029420443359904833
10 cleaned testing roc score:  0.5055003314023403
STEMMED SCORES
10 stemmed training acc:  0.44885011895321175
10 stemmed testing acc:  0.29961260785349536
10 stemmed testing F1 score:  0.03895414416761144
10 stemmed testing roc score:  0.5124778559464753
DEFAULT SCORES
10 default training acc:  0.4020618556701031
10 default testing acc:  0.2411516111991548
10 default testing F1 score:  0.019901441157023664
10 default testing roc score:  0.49485538335176105


In [252]:
# fitting logistic regression model on training data (5 TOPICS)

# CLEANED

train_size = int(0.1 * len(lda_features_df_5_cleaned))
train = lda_features_df_5_cleaned[:train_size]
test = lda_features_df_5_cleaned[train_size:]

X_all = lda_features_df_5_cleaned[['0','1', '2', '3', '4']]
y_all = lda_features_df_5_cleaned['rating']

X_train = train[['0','1', '2', '3', '4']]
y_train = train['rating']

X_test = test[['0','1', '2', '3', '4']]
y_test = test['rating']

logm_5_cleaned = LogisticRegression()
logm_5_cleaned.fit(X_train, y_train)

print("CLEANED SCORES")

print("5 cleaned training score: ", logm_5_cleaned.score(X_train, y_train))
print("5 cleaned testing score: ", logm_5_cleaned.score(X_test, y_test))

y_pred = logm_5_cleaned.predict(X_test)
print("5 cleaned testing F1 score: ", f1_score(y_test, y_pred, average = 'macro'))

y_test_binarized = label_binarize(list(y_test), classes = labels)
y_preds_binarized = label_binarize(list(y_pred), classes = labels)
print("5 cleaned testing roc score: ", roc_auc_score(y_test_binarized, y_preds_binarized, average = 'macro', multi_class = 'ovo'))

# STEMMED

train_size = int(0.1 * len(lda_features_df_5_stemmed))
train = lda_features_df_5_stemmed[:train_size]
test = lda_features_df_5_stemmed[train_size:]

X_all = lda_features_df_5_stemmed[['0','1', '2', '3', '4']]
y_all = lda_features_df_5_stemmed['rating']

X_train = train[['0','1', '2', '3', '4']]
y_train = train['rating']

X_test = test[['0','1', '2', '3', '4']]
y_test = test['rating']

logm_5_stemmed = LogisticRegression()
logm_5_stemmed.fit(X_train, y_train)

print("STEMMED SCORES")

print("5 stemmed training score: ", logm_5_stemmed.score(X_train, y_train))
print("5 stemmed testing score: ", logm_5_stemmed.score(X_test, y_test))

y_pred = logm_5_stemmed.predict(X_test)
print("5 stemmed testing F1 score: ", f1_score(y_test, y_pred, average = 'macro'))

y_test_binarized = label_binarize(list(y_test), classes = labels)
y_preds_binarized = label_binarize(list(y_pred), classes = labels)
print("5 stemmed testing roc score: ", roc_auc_score(y_test_binarized, y_preds_binarized, average = 'macro', multi_class = 'ovo'))


# DEFAULT

train_size = int(0.1 * len(lda_features_df_5_default))
train = lda_features_df_5_default[:train_size]
test = lda_features_df_5_default[train_size:]

X_all = lda_features_df_5_default[['0','1', '2', '3', '4']]
y_all = lda_features_df_5_default['rating']

X_train = train[['0','1', '2', '3', '4']]
y_train = train['rating']

X_test = test[['0','1', '2', '3', '4']]
y_test = test['rating']

logm_5_default = LogisticRegression()
logm_5_default.fit(X_train, y_train)

print("DEFAULT SCORES")

print("5 default training score: ", logm_5_default.score(X_train, y_train))
print("5 default testing score: ", logm_5_default.score(X_test, y_test))

y_pred = logm_5_default.predict(X_test)
print("5 default testing F1 score: ", f1_score(y_test, y_pred, average = 'macro'))

y_test_binarized = label_binarize(list(y_test), classes = labels)
y_preds_binarized = label_binarize(list(y_pred), classes = labels)
print("5 default testing roc score: ", roc_auc_score(y_test_binarized, y_preds_binarized, average = 'macro', multi_class = 'ovo'))

CLEANED SCORES
5 cleaned training score:  0.4274385408406027
5 cleaned testing score:  0.2900158478605388
5 cleaned testing F1 score:  0.030602484476379446
5 cleaned testing roc score:  0.5072406203665009
STEMMED SCORES
5 stemmed training score:  0.43536875495638383
5 stemmed testing score:  0.30198978693431944
5 stemmed testing F1 score:  0.033609750498231734
5 stemmed testing roc score:  0.5108904819148777
STEMMED SCORES
5 default training score:  0.4425059476605868
5 default testing score:  0.29503433703116744
5 default testing F1 score:  0.03548037647244931
5 default testing roc score:  0.5087845086358085
