In [261]:
import pandas as pd
import numpy as np 
import string 
import math

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.corpus import subjectivity
from nltk.classify import NaiveBayesClassifier

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

from beer_data_utils import get_user_dummies
from sklearn.feature_extraction.text import TfidfVectorizer 

import matplotlib.pyplot as plt


%matplotlib inline

In [218]:
train = pd.read_csv("/data/beer/train.csv")

test = pd.read_csv("/data/beer/test.csv")

In [219]:
train.head()

Unnamed: 0,index,beer/ABV,beer/beerId,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,review/timeStruct,review/timeUnix,user/ageInSeconds,user/birthdayRaw,user/birthdayUnix,user/gender,user/profileName
0,40163,5.0,46634,14338,Chiostro,Herbed / Spiced Beer,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,"{'min': 38, 'hour': 3, 'mday': 16, 'sec': 10, ...",1229398690,,,,,RblWthACoz
1,8135,11.0,3003,395,Bearded Pat's Barleywine,American Barleywine,4.0,3.5,3.5,3.5,3.0,12oz bottle into 8oz snifter.\t\tDeep ruby red...,"{'min': 38, 'hour': 23, 'mday': 8, 'sec': 58, ...",1218238738,,,,,BeerSox
2,10529,4.7,961,365,Naughty Nellie's Ale,American Pale Ale (APA),3.5,4.0,3.5,3.5,3.5,First enjoyed at the brewpub about 2 years ago...,"{'min': 7, 'hour': 18, 'mday': 26, 'sec': 2, '...",1101492422,,,,Male,mschofield
3,44610,4.4,429,1,Pilsner Urquell,Czech Pilsener,3.0,3.0,2.5,3.0,3.0,First thing I noticed after pouring from green...,"{'min': 7, 'hour': 1, 'mday': 20, 'sec': 5, 'y...",1308532025,1209827000.0,"Aug 10, 1976",208508400.0,Male,molegar76
4,37062,4.4,4904,1417,Black Sheep Ale (Special),English Pale Ale,4.0,3.0,3.0,3.5,2.5,A: pours an amber with a one finger head but o...,"{'min': 51, 'hour': 6, 'mday': 12, 'sec': 48, ...",1299912708,,,,,Brewbro000


In [220]:
y_columns = ['review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste']

x_columns = [] 

In [221]:
def remove_small_words(text):
    leng = 4
    new_txt = ''
    for word in str.split(text):
        word = word.lower().lstrip(string.punctuation).strip().rstrip(string.punctuation).replace("’s", '').replace("“", '').replace("”", '').replace("’", '').replace("'s", '').replace('...', '').replace('"', '')
        if len(word) > leng:
            if len(new_txt) == 0:
                new_txt = new_txt + str(word)
            else:
                new_txt = new_txt + " " + str(word)
    return new_txt 

In [222]:
def tf_idf(df,
           col_to_vectorize,
           top_n_words_to_keep = 250):
    """
    DESCRIPTION:
        * Given a pandas DataFrame and a specified columns, creates TF-IDF columns.
        
    PARAMS:
        * df                   --> Pandas DataFrame containing variable to TF-IDF vectorize.
        * col_to_vectorize     --> String specifying column to vectorize.
        * top_n_words_to_keep  --> Max number of columns that will be added to the DataFrame. If = 100, will keep the 100 most common words.
        
    RETURNS:
        Pandas DataFrame with TF-IDF vectors.
    """
    
    # Remove small words ( < len(5) ) from the reviews
    df[col_to_vectorize] = df[col_to_vectorize].fillna(value = 'Unknown').apply(remove_small_words)
    
    # Fit the TF-IDF vectorizer
    tfidf_model = TfidfVectorizer(stop_words="english", max_features = top_n_words_to_keep)
    words = tfidf_model.fit_transform(df[col_to_vectorize])
    
    # Convert tfidf vectors to a DataFrame
    words_df = pd.DataFrame(words.toarray(), columns=tfidf_model.get_feature_names())
    
    words_df['index'] = df['index']
    
    # Creates a list of the most [top_n_words_to_keep] common words
    #cols = (words_df > 0).sum().sort_values(ascending = False)[:top_n_words_to_keep].index.values.tolist()
    
    # Combine our original DataFrame (df) and our TF-IDF model
    #words = words_df[cols]
    #words = words.reset_index()
    #df = df.reset_index()
    
    return words_df

In [223]:
def lexical(s):
    tokens = s.split(" ")
    return len(set(tokens)) / len(tokens) 

In [224]:
def transform_x(df):
    sentences = df[['review/text', 'index']]
    
    df['review/text'] = df['review/text'].apply(str)
    
    #df['review_len'] = df['review/text'].apply(len)
    #df['lexi'] = df['review/text'].apply(lexical)
    
    sid = SentimentIntensityAnalyzer()
    sentiment = []
    for sentence, index in sentences.values:
        sentence = str(sentence)    
        ss = sid.polarity_scores(sentence)
        sentiment.append([index] + list(ss.values()))
    sentframe = pd.DataFrame()
    sentframe = sentframe.append(sentiment)
    sentframe.columns=["index", "compound", "neg", "neu", "pos"]    
    style_dummies = pd.get_dummies(df[['index','beer/style']])
    name_dummies = pd.get_dummies(df[['index','beer/name']])
    user_dummies = get_user_dummies(df)
    udummies = list(user_dummies.columns.drop(["index"]))
    udummies.pop()
    uname_dummies = list(name_dummies.columns.drop(["index"]))
    uname_dummies.pop()
    
    #TODO add brewer ID 
    df['beer/brewerId'] = df['beer/brewerId'].apply(str)
    brewer_dummies = pd.get_dummies(df[['index','beer/brewerId']])
    bd = list(brewer_dummies.columns.drop(["index"]))
    bd.pop()
    
    tfidf = tf_idf(df, 'review/text')
    df = pd.merge(df, tfidf)
    df = pd.merge(df, brewer_dummies)
    df = pd.merge(df, user_dummies)
    df = pd.merge(df, style_dummies)
    df = pd.merge(df, name_dummies)
    df = pd.merge(df, sentframe)
    
    #abv = []
    #for i in range(2, 4):
    #    abv.append('beer/ABV**' + str(i))
    #    df['beer/ABV**' + str(i)] = df['beer/ABV']**i
    x_columns = ['beer/ABV'] + list(style_dummies.columns.drop(["index", "beer/style_American IPA"]))\
        + uname_dummies + bd \
        + list(sentframe.columns.drop("index")) + udummies + list(tfidf.columns.drop('index'))
    
    return df, x_columns


In [225]:
train, x_columns = transform_x(train)


In [226]:
test, x_col = transform_x(test)

In [227]:
x_columns_to_model = list(set(x_col).intersection(x_columns)) + ['ones']
#todo 
train['ones'] = 1
test['ones'] = 1

In [228]:
def lassoChoose(y_col, x_cols):
    y = train[y_col]
    #x_cols.remove('review_len')
    #x_cols.remove('lexi')
    X = train[x_cols]
    
    #std_X = StandardScaler().fit_transform(X)
    #std_m = LassoCV(cv=10)
    regular_m = LassoCV(cv =10)

    return regular_m.fit(X, y)

In [229]:
# remove level_0 


In [230]:
models = dict()
for y_c in y_columns: 
    models[y_c] = lassoChoose(y_c, x_columns_to_model)

In [326]:
predictions = test[['index']]
for y_c in y_columns: 
    predictions[y_c] = models[y_c].predict(test[x_columns_to_model])
    predictions[y_c] = predictions[y_c].apply(limit)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [318]:
#predictions.head()


In [313]:
#old_pred = pd.read_csv("predictions.csv")

vals = dict()
plt.style.use('ggplot')
for y_c in y_columns:
    c = models[y_c].coef_
    print(y_c)
    total = sum(np.fabs(c) > 0)
    print(total)
    columns = zip(x_columns_to_model, c)
    def get_key(k):
        return k[1]
    s = sorted(columns, key = get_key )
    
    #print([t[0] for t in s[0:10]])
    #print(s[0:10])
    #print([t[0] for t in s])
    #[t2[1] for t2 in ['beer/name' in t[0] for t in s[0:sum(np.fabs(c) > 0)]]]
    n = sum(['beer/name' in t[0] for t in s[0:sum(np.fabs(c) > 0)]])
    st = sum(['beer/style' in t[0] for t in s[0:sum(np.fabs(c) > 0)]])
    b = sum(['beer/brew' in t[0] for t in s[0:sum(np.fabs(c) > 0)]])
    u = sum(['user/' in t[0] for t in s[0:sum(np.fabs(c) > 0)]])
    ts = sum(['/' not in t[0] for t in s[0:sum(np.fabs(c) > 0)]])
    
    vals[y_c] = [n, st, b, u, ts]
    
    
    
    print(str(n+ st + b + u + ts))

N = 5
ind = np.arange(N)  # the x locations for the groups
width = 0.175       # the width of the bars

fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)

i = 0 
cs = ['xkcd:pinkish red', 'xkcd:grass green', 'xkcd:purple blue', 'xkcd:brownish orange', 'xkcd:light blue']
rects = []
for y_c in y_columns:
    rects.append(ax.bar(ind+width*i, vals[y_c], width, color=cs[i]))
    i +=1


ax.set_ylabel('Count')
ax.set_xticks(ind+width)
ax.set_xticklabels( ['beer name', 'style', 'brewer', 'user', 'tf/sent'], fontsize=20 )
ax.legend( rects, y_columns, prop={'size': 20} )

#plt.show() 
plt.savefig("interp")

len(x_columns_to_model)

review/appearance
173


TypeError: 'bool' object is not subscriptable

In [270]:
old_pred.head()

NameError: name 'old_pred' is not defined

In [327]:
#predictions.to_csv("predictions.csv", index=False)

In [325]:
#cross val testing 

train_preds = train[['index']]
for y_c in y_columns: 
    train_preds[y_c] = models[y_c].predict(train[x_columns_to_model])
    train_preds[y_c] = train_preds[y_c].apply(limit)
    print(mean_squared_error(train[y_c], train_preds[y_c]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.219175719234
0.253276009118
0.295341375849
0.25241227785
0.263247444896


In [324]:
def limit(n):
    if n < 1:
        return 1
    if n > 5:
        return 5
    return n

In [190]:
preds_lin = train[['index']]
X = train[x_columns_to_model]

predictions = test[['index']]

for y_c in y_columns: 
    model = LinearRegression()
    model.fit(X, train[y_c])
    preds_lin[y_c] = model.predict(train[x_columns_to_model])
    
    preds_lin[y_c] = preds_lin[y_c].apply(limit)
    
    predictions[y_c] = models[y_c].predict(test[x_columns_to_model])
    predictions[y_c] = predictions[y_c].apply(limit)
    
    print(mean_squared_error(train[y_c], preds_lin[y_c]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentat

0.222053333333
0.246666666667
0.296606666667
0.24888
0.253533333333


In [116]:
predictions.head()

Unnamed: 0,index,review/appearance,review/aroma,review/overall,review/palate,review/taste
0,13803,3.788732,3.698851,3.829343,3.702836,3.760472
1,13960,3.765226,3.650134,3.81162,3.651759,3.69963
2,26737,3.930354,3.900961,3.931108,3.864758,3.923045
3,5747,3.78354,3.694043,3.832977,3.701048,3.760302
4,4074,3.92931,3.924221,3.939706,3.90304,3.976721


In [232]:
weird limit 

0.237973333333
0.273946666667
0.315533333333
0.27248
0.283613333333

lin reg 
0.20039268168
0.224626640786
0.274483310488
0.227461018152
0.23390445181


0.250492722025
0.290979837809
0.356076108992
0.300332677921
0.316383172953


0.307884630126
0.366817717505
0.424387147487
0.360584269548
0.397666580851


10.9406435636
6.44454530485
12.9358954616
8.1349888453
7.7589664513

SyntaxError: invalid syntax (<ipython-input-232-204249a261d2>, line 1)