## Intro to recommender systems
We continue the previous lecture & explain how to put everything together to build a data pipeline!


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, make_scorer, recall_score, precision_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC 
from sklearn.feature_extraction.text import CountVectorizer

### Data Pipeline

How can a model predict future data?

Previously we have followed these steps:
1) Divide data into train and test data
2) Scaling: standardization, normalization
3) Dimensionality reduction / feature selection: PCA, Random Forest (feature importances)
4) Learning algorithm: SVM, k-NN, Decision tree...
5) Predictive model: predict class labels

Now we will only have:
* pipeline.fit -> pipeline.predict
* *Gets raw data as input and returns valuable insight as output*

#### Cancer data

In [None]:
# Load the cancer dataset (path is relative to the notebook's working directory)
cancer_df = pd.read_csv(filepath_or_buffer='Data/cancer_data.csv')
cancer_df.head()  # preview the first five rows

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts)
cancer_df.info()

In [None]:
# Encode the string class labels (M, B) as integers (1, 0).
# LabelEncoder is already imported in the notebook's import cell.

# Features: every column from position 2 onward; target: the column at position 1
X, y = cancer_df.iloc[:, 2:].values, cancer_df.iloc[:, 1].values

# Learn the label vocabulary first, then map the labels;
# LabelEncoder numbers the classes in sorted order
le = LabelEncoder().fit(y)
y = le.transform(y)

# Show which integer each original label received
y_enc = le.transform(['M', 'B'])
print('[M B] labels ->', y_enc)

In [None]:
# Train/test split (80/20).
# train_test_split is already imported in the notebook's import cell.
#
# random_state pins the shuffle so every metric computed below is reproducible
# after Restart & Run All; stratify=y keeps the class ratio of y the same in
# the training and the test split.

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# # Pre-processing 
# from sklearn.preprocessing import StandardScaler
# # Dimension reduction
# from sklearn.decomposition import PCA
# # Classification
# from sklearn.linear_model import LogisticRegression 
# from sklearn.pipeline import Pipeline

In [None]:
# One estimator that chains: standardisation -> PCA (2 components) -> logistic regression
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression()),
])

# fit() returns the fitted pipeline itself, so training and prediction can be chained
y_pred = pipe_lr.fit(X_train, y_train).predict(X_test)


In [None]:
# 5-fold cross-validation of the whole pipeline, using the training split only.
# cross_val_score is already imported in the notebook's import cell.

scores = cross_val_score(pipe_lr, X_train, y_train, cv=5, n_jobs=1)

# Per-fold scores, followed by their mean and standard deviation
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Hold-out evaluation of the fitted logistic-regression pipeline on the test split.
# The metric functions are already imported in the notebook's import cell.

print(
    'Accuracy: %.3f' % pipe_lr.score(X_test, y_test),
    'Precision: %.3f' % precision_score(y_test, y_pred),
    'Recall: %.3f' % recall_score(y_test, y_pred),
    'F1: %.3f' % f1_score(y_test, y_pred),
    sep='\n',  # one metric per line
)

In [None]:
# confusion_matrix is already imported in the notebook's import cell.

confmat = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', confmat)

# Layout returned by scikit-learn for binary labels 0/1
# (rows = true label, columns = predicted label, classes in sorted order):
# [[TN, FP],
#  [FN, TP]]

In [None]:
# Draw the confusion matrix as an annotated heat map

fig, ax = plt.subplots(figsize=(3, 3))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.5)

# Write each count at the centre of its cell (column index -> x, row index -> y)
for i, j in np.ndindex(*confmat.shape):
    ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()


#### Quiz: Non-linear data 

In [None]:
# Load the circular (non-linearly separable) dataset and preview it
circle_df = pd.read_csv(filepath_or_buffer='Data/circle_data_v2.csv')
circle_df.head()  # first five rows

In [None]:
# Re-encode the class labels [-1 1] as [0 1].
# LabelEncoder is already imported in the notebook's import cell.

# Features: the label-based column slice 'x0'..'x1' (inclusive); target: column 'y'
X, y = circle_df.loc[:, 'x0':'x1'].values, circle_df.loc[:, 'y'].values

# Learn the label vocabulary, then map the labels to consecutive integers
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Train/test split (70/30).
# train_test_split is already imported in the notebook's import cell.
#
# random_state pins the shuffle so every metric computed below is reproducible
# after Restart & Run All; stratify=y keeps the class ratio of y the same in
# the training and the test split.

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# # Pre-processing:
# from sklearn.preprocessing import StandardScaler
# # Dimension reduction:
# from sklearn.decomposition import PCA
# # Classification:
# from sklearn.svm import SVC 
# from sklearn.pipeline import Pipeline

In [None]:
# One estimator that chains: standardisation -> PCA (2 components) -> support vector classifier
pipe_svc = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', SVC()),
])

# fit() returns the fitted pipeline itself, so training and prediction can be chained
y_pred = pipe_svc.fit(X_train, y_train).predict(X_test)

In [None]:
# 5-fold cross-validation of the SVC pipeline, using the training split only.
# cross_val_score is already imported in the notebook's import cell.

scores = cross_val_score(pipe_svc, X_train, y_train, cv=5, n_jobs=1)

# Per-fold scores, followed by their mean and standard deviation
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Hold-out evaluation of the fitted SVC pipeline on the test split.
# The metric functions are already imported in the notebook's import cell.

print(
    'Accuracy: %.3f' % pipe_svc.score(X_test, y_test),
    'Precision: %.3f' % precision_score(y_test, y_pred),
    'Recall: %.3f' % recall_score(y_test, y_pred),
    'F1: %.3f' % f1_score(y_test, y_pred),
    sep='\n',  # one metric per line
)

In [None]:
# Confusion matrix of the SVC predictions: printed, then drawn as an annotated heat map
confmat = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', confmat)

fig, ax = plt.subplots(figsize=(3, 3))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.5)

# Write each count at the centre of its cell (column index -> x, row index -> y)
for i, j in np.ndindex(*confmat.shape):
    ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()


### Recommender Systems 

Data -> [Predictive Model] -> Interface

**Decision making - Aspect model**
How do we make choices in life?
* A - Attributes
* S - Social Influence
* P - Policies
* E - Experience
* C - Consequences
* T - Trial and error

**Types of Recommender Systems**
* Content-based filtering (CBF)
* Collaborative Filtering (CF)
* Hybrid (CBF+CF)

In [None]:
# Load the movie/genre table and preview it
genre_df = pd.read_csv(filepath_or_buffer='Data/movies-genres.csv')
genre_df.head()  # first five rows

In [None]:
# Create one bag-of-words count vector per movie from its genre string.
# CountVectorizer is already imported in the notebook's import cell.
# NOTE(review): the default tokenizer treats punctuation as a separator and keeps
# only tokens of 2+ characters, so a hyphenated genre such as "Sci-Fi" would be
# split into two tokens — confirm this is acceptable for the genres column.

count = CountVectorizer()
count_matrix = count.fit_transform(genre_df['genres'])

In [None]:
# Convert the count matrix to a dense array and show the vectors of the first five movies
count_array = count_matrix.toarray()
print(count_array[:5])

In [None]:
# Pairwise cosine similarity between the genre vectors of all movies.
# cosine_similarity is already imported in the notebook's import cell.
# Called with a single argument, it compares the matrix against itself.

sim_matrix = cosine_similarity(count_matrix)
sim_matrix[:5, :5]  # top-left 5x5 corner

#### Building a simple recommender function 

In [None]:
def simple_recommender(data_frame, movie_id, sim_matrix, top_n=10):
    """Recommend the movies most similar to a given movie.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Movie table with a ``title`` column. Row ``k`` must describe the same
        movie as row ``k`` of ``sim_matrix``.
    movie_id : int
        Row position (not an index label) of the query movie in
        ``data_frame`` / ``sim_matrix``.
    sim_matrix : 2-D array of shape (n_movies, n_movies)
        Pairwise similarity scores between movies.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows with the columns ``similarity`` and ``title``,
        ordered by decreasing similarity. The query movie itself is never part
        of the result. The index holds each movie's row position.
    """
    # Pair scores and titles by position, so a non-default DataFrame index
    # cannot misalign them (pd.concat(axis=1) would align on index labels).
    movie_rec = pd.DataFrame({
        'similarity': sim_matrix[movie_id],
        'title': data_frame.loc[:, 'title'].to_numpy(),
    })

    # Remove the query movie explicitly. Relying on it being the first row
    # after sorting is wrong whenever other movies tie with it at the maximum
    # similarity (e.g. movies with identical genres).
    movie_rec = movie_rec.drop(index=movie_rec.index[movie_id])

    # Stable sort keeps tied movies in their original order, which makes the
    # output deterministic.
    movie_rec = movie_rec.sort_values(by='similarity',
                                      ascending=False,
                                      kind='mergesort')

    # Top-N recommendations based on similarity
    return movie_rec.head(top_n)


In [None]:
# Try the recommender on a single movie
movie_id = 8  # row position of the query movie in genre_df / sim_matrix

simple_recommender(genre_df, movie_id, sim_matrix)
