In [1]:
import math
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

ModuleNotFoundError: No module named 'pandas_profiling'

In [None]:
url = 'https://twoldem-ml.s3.us-east-2.amazonaws.com/beer_reviews.csv'

df = pd.read_csv(url)

df.head()

In [None]:
df[ 'review_time' ] = pd.to_datetime( df[ 'review_time' ], unit = 's' )
df.tail()

# **Data Exploration**

In [None]:
df.dtypes

In [None]:
for i in df.columns[df.dtypes == object].values:
    print(i, df[i].unique())

In [None]:
# df.info(null_counts = True)
df.info()

In [None]:
df.columns

In [None]:
data_count = df.describe().transpose()
data_count

In [None]:
# df.drop(columns=['brewery_id', "beer_beerid"], inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.columns = df.columns.str.strip()

In [None]:
# df.title.value_counts()  #checking to see if we need to encode title variables


In [None]:
print( 'Unique breweries' )
print( 'By id:', df[ 'brewery_id' ].nunique() )
print( 'By name:', df[ 'brewery_name' ].nunique() )

In [None]:
print( 'Unique beers' )
print( 'By id:', df[ 'beer_beerid' ].nunique() )
print( 'By name:', df[ 'beer_name' ].nunique() )

In [None]:
df[ 'review_profilename' ].value_counts().head()

In [None]:
df.shape

In [None]:
df.value_counts('review_overall')
# drop 0, round them up 

# **Linear Regression Model**

In [None]:
linear_model = LinearRegression( normalize = True )

In [None]:
X = df[[ 'review_aroma', 'review_appearance', 'review_palate', 'review_taste' ] ]
y = df[ 'review_overall' ]
linear_model.fit(X,y)

In [None]:
# training the model 

preds = linear_model.predict(df[ [ 'review_aroma', 'review_appearance', 'review_palate', 'review_taste' ] ] )

In [None]:
# Coeffifients for each feature in X 
linear_model.coef_

In [None]:
# checking error in the model 
# Based on a rule of thumb, it can be said that RMSE values between 0.2 and 0.5 shows that the model can relatively predict the data accurately.
np.sqrt( mean_squared_error(df[ 'review_overall' ], preds ) )

In [None]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X,y,
#                                                     test_size=0.33, 
#                                                     random_state=42)
# X_test

# **Chelsea tries logistic regression**

In [None]:
#making two bins, "excellent" vs "not excellent"
cut_reviews = ["not excellent", "excellent"]
review_bins = [0, 3.5, 5.0]
df['excellent_reviews'] = pd.cut(df['review_overall'], bins=review_bins, labels= cut_reviews)
df.head()

In [None]:
from sklearn.linear_model           import LogisticRegression
import numpy as np
from sklearn                        import metrics, svm
df['excellent_reviews'].value_counts()

In [None]:
df_logistic = df.dropna()
x = df_logistic.excellent_reviews.values.reshape(-1, 1)
Y = df_logistic.beer_abv.values.reshape(-1, 1)


print("Shape: ", x.shape, Y.shape)


In [None]:
x = df_logistic['beer_abv']
Y = df_logistic['excellent_reviews']
Y = df_logistic['excellent_reviews'].values.reshape(-1, 1)
x = df_logistic['beer_abv'].values.reshape(-1, 1)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.33, random_state=42)
logreg = LogisticRegression(multi_class='multinomial')
logreg.fit(x_train, Y_train)

In [None]:
Y_pred = logreg.predict(x_test)
#print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(x_test, Y_test)))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, Y_pred)
print(confusion_matrix)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))

In [None]:

print("Training set score: {:.3f}".format(logreg.score(x_train, Y_train))) 
print("Test set score: {:.3f}".format(logreg.score(x_test, Y_test)))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(Y_test, logreg.predict(x_test))
fpr, tpr, thresholds = roc_curve(Y_test, logreg.predict_proba(x_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
predictions = linear_model.predict(x)
# Plot Residuals
plt.scatter(predictions, predictions - y)
plt.hlines(y=0, xmin=predictions.min(), xmax=predictions.max())
plt.show()

The receiver operating characteristic (ROC) curve is another common tool used with binary classifiers. The dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).

In [None]:
# pd.crosstab(df_logistic.excellent_reviews,df_logistic.beer_abv).plot(kind='bar')
# plt.title('Excellent Beer Ratings by ABV')
# plt.xlabel('Excellent Category')
# plt.ylabel('ABV')


In [None]:
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.linear_model           import LogisticRegression
#### GRIDSEARCH #### 
#Hypertuning parameters
logreg()

In [None]:
param_grid = {'max_iter': [100, 1000, 2000], 'solver': ['newton-cg','sag', 'saga'], 'C': [0.001, 0.01, 0.1], 'penalty': ['l2','l1']}
grid = GridSearchCV(LogisticRegression(),param_grid,refit=True,verbose=0)
grid.fit( x_train, Y_train)
print(grid.best_params_)

In [None]:
# logmodel = LogisticRegression('C': 0.001, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg')
logmodel = LogisticRegression(grid.best_params_)
# next .fit, then 
logreg.fit(x_train, Y_train)

In [None]:
Y_pred = logreg.predict(x_test)

In [None]:
mse1 = mean_squared_error(Y_test, Y_pred)
rmse1 = np.sqrt(mse1)
rmse1

# **Graphs**

In [None]:
import plotly.express as px

fig = px.parallel_coordinates(df, color="review_overall", 
          labels=dict(zip(list(df.columns), 
          list(['_'.join(i.split('_')[1:]) for i in df.columns]))),
          color_continuous_scale=px.colors.diverging.Tealrose,
          color_continuous_midpoint=27)

fig.show()

# **Other Models to test?**

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

import pandas as pd

In [None]:
predictions = linear_model.predict(X)
# Plot Residuals
plt.scatter(predictions, predictions - y)
plt.hlines(y=0, xmin=predictions.min(), xmax=predictions.max())
plt.show()

In [None]:
import numpy as np
from sklearn                        import metrics, svm
# from sklearn.linear_model           import LinearRegression
from sklearn.linear_model           import LogisticRegression
from sklearn.tree                   import DecisionTreeClassifier
from sklearn.neighbors              import KNeighborsClassifier
from sklearn.discriminant_analysis  import LinearDiscriminantAnalysis
from sklearn.naive_bayes            import GaussianNB
from sklearn.svm                    import SVC

In [None]:
model = LogisticRegression(solver='liblinear', random_state=0)

In [None]:
# clf = LinearRegression()
# clf.fit(trainingData, trainingScores)
# print("LinearRegression")
# print(clf.predict(predictionData))

clf = svm.SVR()
clf.fit(trainingData, trainingScores)
print("SVR")
print(clf.predict(predictionData))

clf = LogisticRegression()
clf.fit(trainingData, trainingScores)
print("LogisticRegression")
print(clf.predict(predictionData))

clf = DecisionTreeClassifier()
clf.fit(trainingData, trainingScores)
print("DecisionTreeClassifier")
print(clf.predict(predictionData))

clf = KNeighborsClassifier()
clf.fit(trainingData, trainingScores)
print("KNeighborsClassifier")
print(clf.predict(predictionData))

clf = LinearDiscriminantAnalysis()
clf.fit(trainingData, trainingScores)
print("LinearDiscriminantAnalysis")
print(clf.predict(predictionData))

clf = GaussianNB()
clf.fit(trainingData, trainingScores)
print("GaussianNB")
print(clf.predict(predictionData))

clf = SVC()
clf.fit(trainingData, trainingScores)
print("SVC")
print(clf.predict(predictionData))

In [None]:
#katietest

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn import svm