# Imports and configs

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

import pandas as pd
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt

# Custom class (local module) that normalizes text
from text_normalization import TextNormalization

# Random seed kept in one place for reproducibility.
# NOTE(review): presumably meant to be passed to the stochastic steps
# (e.g. KMeans) - confirm it is actually used where needed.
random_state = 0

# Load Data

In [None]:
# Load the wine reviews; the cells below rely on the `description`,
# `variety` and `title` columns of this frame.
df = pd.read_parquet('wine_reviews.parq')

# Filter data

In [None]:
def valid_description(x, min_length=50):
    """Return True when a review description is usable for recommendations.

    A description is rejected when it

    * is not a string (e.g. ``None`` or ``NaN`` for a missing review),
    * contains "imported by" (case-insensitive) - some descriptions are
      only an importer credit such as "Imported by Someone", or
    * is shorter than ``min_length`` characters.

    Args:
        x: The raw description value.
        min_length: Minimum number of characters a description must have.

    Returns:
        bool: True if the description should be kept, False otherwise.
    """
    # Missing values have no .lower()/len(); treat them as invalid instead
    # of raising in the middle of a DataFrame.apply.
    if not isinstance(x, str):
        return False
    if 'imported by' in x.lower():
        return False
    return len(x) >= min_length

In [None]:
# Eliminate the short descriptions and the invalid ones
df['valid_description'] = df['description'].apply(valid_description)
print('Eliminating {} invalid descriptions'.format((~df['valid_description']).sum()))
# .copy() makes the filtered frame independent of the original, so the
# column assignments in the following cells do not raise
# SettingWithCopyWarning (or silently write to a temporary).
df = df[df['valid_description']].copy()

In [None]:
# Run every description through the project's TextNormalization helper
# and store the result in a new `text` column.
text_normalization = TextNormalization()
df['text'] = [
    text_normalization.text_normalization(description)
    for description in df['description']
]

In [None]:
# Append the grape variety to the text when the wine has one.
# notna() treats both None and NaN as missing (the previous `is not None`
# test let NaN through, which fails on str + float), and the vectorised
# concatenation avoids a slow row-by-row DataFrame.apply.
has_variety = df['variety'].notna()
text_with_variety = (
    df.loc[has_variety, 'text'] + ' ' + df.loc[has_variety, 'variety'].astype(str)
)
# Keep the plain text for rows without a variety, use the combined text
# for the others (aligned on the index).
df['text'] = df['text'].where(~has_variety, text_with_variety)

In [None]:
# Rows were filtered out above, so renumber the index as 0..n-1: later cells
# merge `df` with the TF-IDF frame on the index and look rows up by the
# integer positions returned by the nearest-neighbours search.
df = df.reset_index(drop=True)

# Tokenizing the text

In [None]:
# Number of reviews left in `df` at this point
len(df)

In [None]:
# Parameters
# Passed as `max_features` to the TfidfVectorizer below (caps the vocabulary size)
MAX_FEATURES = 1000

In [None]:
# Build the TF-IDF document-term matrix over the normalized text, limited to
# a vocabulary of at most MAX_FEATURES terms.
tf = TfidfVectorizer(max_features=MAX_FEATURES)
dtm = tf.fit_transform(df['text'].values.astype('U'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement and fall back only on older releases.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

# toarray() returns a plain ndarray, whereas todense() returns a legacy
# np.matrix.
dtm = pd.DataFrame(dtm.toarray(), columns=feature_names)

# One row per review: TF-IDF columns followed by the original review columns
df_with_features = dtm.merge(df, left_index=True, right_index=True)

In [None]:
# Row counts of the three frames, displayed side by side for comparison
len(df), len(dtm), len(df_with_features)

## Selecting the best features (words) to train on
We consider 2 features for each word:
* Number of different important varieties in which the word appears. (A variety counts as important for a word if more than 5 reviews containing that word belong to it.)
* Number of different titles in which the word appears.

We believe that if a word appears in many different kinds of wine, it cannot distinguish well between the titles.

In [None]:
from tqdm.notebook import tqdm

# For every vocabulary word, measure how widely it is spread over the reviews:
#   n_important_variety - number of varieties that have more than 5 reviews
#                         containing the word
#   n_titles            - number of distinct titles whose review contains it
#
# The mask is computed on `dtm` and applied to `df` directly (both have the
# same row order). Reading from the merged frame breaks whenever a vocabulary
# word equals a `df` column name such as "variety" or "title": the merge then
# renames both columns with _x/_y suffixes and every lookup fails - which the
# previous blanket `except Exception` only printed, leaving words out.
dict_column_n_variety = {}
for column in tqdm(sorted(dtm.columns)):
    # Positional boolean mask of the reviews that contain this word
    mask = (dtm[column] > 0).to_numpy()
    variety_counts = df.loc[mask, 'variety'].value_counts()
    dict_column_n_variety[column] = {
        'n_important_variety': int((variety_counts > 5).sum()),
        'n_titles': df.loc[mask, 'title'].nunique(),
    }

## Scaling features

In [None]:
# Names of the two per-word statistics, in the order they were stored
feature_columns = ['n_important_variety', 'n_titles']

# Transpose so that each row is a word and each column a statistic
df_features_variety = pd.DataFrame(dict_column_n_variety).T
words_labels = df_features_variety.index.tolist()
X = df_features_variety.values

# Rescale every statistic with MinMaxScaler (default range [0, 1])
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Scaled statistics plus the word each row belongs to
df_normalized = pd.DataFrame(X_normalized, columns=feature_columns)
df_normalized['word'] = words_labels

In [None]:
# Scatter plot of the two scaled word statistics (one point per word)
px.scatter(df_normalized, x='n_important_variety',y='n_titles')

## Clustering the words by those features

In [None]:
# Group the words into 3 clusters based on the two scaled statistics.
# - random_state comes from the notebook-level config instead of a second
#   hard-coded 0, so the seed is defined in a single place.
# - n_init is pinned to 10 (the historical default) because scikit-learn 1.4
#   changed the default to 'auto', which would alter the clustering.
kmeans = KMeans(
    n_clusters=3,
    n_init=10,
    random_state=random_state,
).fit(df_normalized[feature_columns])
df_normalized['label'] = kmeans.labels_

In [None]:
# Same scatter plot, coloured by the KMeans cluster label of each word
px.scatter(df_normalized, x='n_important_variety',y='n_titles', color='label')

In [None]:
# Number of words assigned to each cluster
df_normalized.label.value_counts()

# Filtering the words that are probably less able to distinguish the wines

In [None]:
# KMeans numbers its clusters arbitrarily, so the cluster to keep is selected
# by where its centroid lies instead of assuming it is label 0. The words
# worth keeping are the ones spread over the fewest varieties/titles, i.e.
# the cluster whose centroid has the smallest coordinates. When that cluster
# already is label 0, the result is identical to the previous selection.
# NOTE(review): confirm on the coloured scatter plot that this is the
# intended cluster.
centroid_totals = kmeans.cluster_centers_.sum(axis=1)
label_to_keep = int(centroid_totals.argmin())

is_kept = df_normalized['label'] == label_to_keep
words_to_eliminate = sorted(df_normalized.loc[~is_kept, 'word'].unique())
words_to_keep = sorted(df_normalized.loc[is_kept, 'word'].unique())

In [None]:
# Drop the TF-IDF columns of the eliminated words; the remaining columns are
# what the nearest-neighbours model is fitted on.
dtm_filtered = dtm.drop(columns=words_to_eliminate)

In [None]:
# Row counts of the four frames, displayed side by side for comparison
len(dtm_filtered), len(df), len(dtm), len(df_with_features)

# Training the recommendation system

In [None]:
# Nearest-neighbours index over the filtered TF-IDF vectors.
# n_neighbors=4 is the number of rows returned per query by default.
nn = NearestNeighbors(
    n_neighbors=4,
    algorithm='ball_tree',
    n_jobs=-1,
)
# Fit on the raw ndarray: the model is queried with `dtm_filtered.values`,
# and fitting on the DataFrame makes scikit-learn (>= 1.0) warn on every
# query that the input "does not have valid feature names".
nn.fit(dtm_filtered.values)

In [None]:
# For every review, get the row positions of its nearest neighbours in the
# fitted model (indices only, no distances).
# NOTE(review): the queried rows are the same rows the model was fitted on,
# so presumably each row comes back as its own nearest neighbour - confirm.
index_recommended = nn.kneighbors(dtm_filtered.values, return_distance=False)

In [None]:
# Map every title to the titles of its nearest neighbours.
#
# - Titles are read once into an array and looked up by position; the
#   previous `df.loc[index].title` built a full row Series for every lookup.
# - dict.fromkeys() removes duplicate titles while preserving the
#   nearest-first order; the previous set difference returned them in an
#   arbitrary order that could change from run to run.
# - `total` lets tqdm show real progress, which enumerate() alone hides.
# - If a title occurs more than once in `df`, the last occurrence wins (as
#   before).
titles = df['title'].to_numpy()

dict_recommendation = {}
for original_index, list_of_index in tqdm(
    enumerate(index_recommended), total=len(index_recommended)
):
    title = titles[original_index]
    neighbour_titles = dict.fromkeys(titles[index] for index in list_of_index)
    # A wine must not be recommended for itself
    neighbour_titles.pop(title, None)
    dict_recommendation[title] = list(neighbour_titles)

In [None]:
import json

In [None]:

# Serialize the title -> recommended titles mapping and store it as JSON
serialized_recommendations = json.dumps(dict_recommendation)
with open('titles_recommended.json', 'w') as fp:
    fp.write(serialized_recommendations)

In [None]:
# Flatten the mapping into one {'title', 'recommended'} record per
# (title, recommended title) pair and store it as a parquet table.
new_list = [
    {'title': source_title, 'recommended': recommended_title}
    for source_title, recommended_titles in dict_recommendation.items()
    for recommended_title in recommended_titles
]
df_recommend = pd.DataFrame(new_list)
df_recommend.to_parquet('titles_recommended.parq')