<a href="https://colab.research.google.com/github/chey-to-mozg/tinkoff_hack2/blob/master/IE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Task description



In this task we predict how satisfied a user will be with a seller, based on the following features:

* user_spends
* user_previous_satisfaction_of_customer(user_rating)
* product_price
* product_quality
* ie_rating
* ie_vote_count


In [None]:
# Mount Google Drive inside the Colab runtime; the final recommendation
# table is written under this mount point at the end of the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

### Dataset description
We use a synthetic dataset consisting of the features listed in the Task description section and a label: the target satisfaction level of a user with an IE. The database consists of 3 tables:

* User(user_id, user_spends)
* IE(ie_id, product_price, product_quality, ie_rating, ie_vote_count)
* Satisfaction(user_id, ie_id, user_rating)

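Labels are generated according to the following rule:
$$prob_{satisfaction} = \sigma\left(w_1 \cdot \frac{price}{spends} + w_2 \cdot rating_{prev} + w_3 \cdot rating_{total} + w_4 \cdot quality + w_5 \cdot count_{vote} + noise\right)$$

The label itself is $\mathrm{round}(4 \cdot prob_{satisfaction}) + 1$, i.e. an integer rating from 1 to 5.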


In [None]:
def Generate_Table_User(num_users):
    """Build the synthetic User table.

    Parameters
    ----------
    num_users : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` (0 .. num_users - 1) and ``user_spends``
        (drawn with ``np.random.uniform(low=0.01, high=1.0)``).
    """
    user_ids = list(range(num_users))
    spends = np.random.uniform(low=0.01, high=1.0, size=num_users)
    return pd.DataFrame({'user_id': user_ids, 'user_spends': spends})

In [None]:
def Generate_Table_IE(num_ie):
    """Build the synthetic IE (seller) table.

    Parameters
    ----------
    num_ie : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Columns ``ie_id`` (0 .. num_ie - 1), ``product_price`` (uniform draw
        between 0.01 and 0.25), ``product_quality`` and ``ie_rating``
        (random integers 1..5) and ``ie_vote_count`` (random integers 1..99).
    """
    # The draws happen in column order, so a seeded run yields the same table.
    ie_ids = list(range(num_ie))
    prices = np.random.uniform(low=0.01, high=0.25, size=num_ie)
    qualities = np.random.randint(1, 6, num_ie)
    ratings = np.random.randint(1, 6, num_ie)
    vote_counts = np.random.randint(1, 100, num_ie)

    columns = {
        'ie_id': ie_ids,
        'product_price': prices,
        'product_quality': qualities,
        'ie_rating': ratings,
        'ie_vote_count': vote_counts,
    }
    return pd.DataFrame(columns)

In [None]:
def Get_Table_Satisfaction(num_users, num_ie, num_sat):
    """Build the synthetic Satisfaction table linking users to sellers.

    Parameters
    ----------
    num_users : int
        User ids are drawn from 0 .. num_users - 1.
    num_ie : int
        Seller ids are drawn from 0 .. num_ie - 1.
    num_sat : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id_sat``, ``ie_id_sat`` and ``user_rating`` (random
        integers 1..5). Ids are drawn independently, so the same
        (user, seller) pair may appear more than once.
    """
    user_ids = np.random.randint(0, num_users, num_sat)
    ie_ids = np.random.randint(0, num_ie, num_sat)
    ratings = np.random.randint(1, 6, num_sat)
    return pd.DataFrame(
        {'user_id_sat': user_ids, 'ie_id_sat': ie_ids, 'user_rating': ratings}
    )

In [None]:
def Get_Tables(num_users, num_ie, num_sat):
    """Generate the three synthetic tables and their joined view.

    Each Satisfaction row is joined to its user and its seller with keyed
    inner merges. The previous implementation built the full
    users x sellers x satisfaction cross join (num_users * num_ie * num_sat
    rows) and then filtered it, which costs far more time and memory for the
    same rows.

    Parameters
    ----------
    num_users, num_ie, num_sat : int
        Row counts of the User, IE and Satisfaction tables.

    Returns
    -------
    tuple
        ``(t_user, t_ie, t_sat, merged_table)``. ``merged_table`` holds the
        User columns, then the IE columns, then the remaining Satisfaction
        columns; rows are ordered by ``user_id`` then ``ie_id`` and the index
        is reset to 0 .. n - 1.
    """
    t_user = Generate_Table_User(num_users)
    t_ie = Generate_Table_IE(num_ie)
    t_sat = Get_Table_Satisfaction(num_users, num_ie, num_sat)

    sat_keys = ['user_id_sat', 'ie_id_sat']
    merged_table = (
        t_sat
        .merge(t_user, left_on='user_id_sat', right_on='user_id', how='inner')
        .merge(t_ie, left_on='ie_id_sat', right_on='ie_id', how='inner')
    )

    # Same row order as the cross-join version: by user, then by seller.
    # np.lexsort is stable and treats the LAST key as the primary one.
    order = np.lexsort(
        (merged_table['ie_id'].to_numpy(), merged_table['user_id'].to_numpy())
    )
    # Same column order as before, with the join-key duplicates left out.
    ordered_columns = (
        list(t_user.columns)
        + list(t_ie.columns)
        + [col for col in t_sat.columns if col not in sat_keys]
    )
    merged_table = merged_table.iloc[order][ordered_columns].reset_index(drop=True)

    return t_user, t_ie, t_sat, merged_table

In [None]:
# def Generate_X(num_samples):
#     num_ie, num_users = 100, 100
#     data = {'user_id' : np.random.randint(1, num_users, num_samples), 'user_spends' : np.random.uniform(low=0.01, high=1.0, size=num_samples), 'user_rating' : np.random.randint(1, 6, num_samples), 'product_price' : np.random.uniform(low=0.01, high=0.25, size=num_samples), \
#             'product_quality' : np.random.randint(1, 6, num_samples), 'ie_rating' : np.random.randint(1, 6, num_samples), 'ie_vote_count' : np.random.randint(1, 100, num_samples), 'ie_id':np.random.randint(1, num_ie, num_samples)}
#     df = pd.DataFrame(data)
#     return df

In [None]:
def True_Dependency(features, level_noise=0.05):
    """Compute the synthetic "ground truth" satisfaction label for one sample.

    The label is ``round(4 * sigmoid(z + noise)) + 1`` where ``z`` is a
    weighted sum of the price/spends ratio, the user's rating, the seller's
    rating, the product quality and the seller's vote count.

    Parameters
    ----------
    features : sequence of 6 numbers
        Must be ordered as ``(user_spends, user_rating, product_price,
        product_quality, ie_rating, ie_vote_count)``.
    level_noise : float, optional
        Standard deviation of the Gaussian noise added inside the sigmoid.
        The original code defined this value (0.05) but ignored it and
        always drew noise with ``scale=0.1``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1,)`` holding the label as a float (1.0 .. 5.0 for
        scalar features).
    """
    w1 = 1 / 5.0
    w2 = w3 = w4 = 1.0 / 30.0
    w5 = 1.0 / 200

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    user_spends, user_rating, product_price, product_quality, ie_rating, ie_vote_count = features

    # size=1 keeps the historical shape-(1,) return value.
    noise = np.random.normal(scale=level_noise, size=1)
    logit = (
        (product_price / user_spends) * w1
        + user_rating * w2
        + ie_rating * w3
        + product_quality * w4
        + ie_vote_count * w5
        + noise
    )
    user_satisfaction = np.round(4 * sigmoid(logit)) + 1

    return user_satisfaction

In [None]:
def Generate_Data():
    """Generate the synthetic feature table and its satisfaction labels.

    Returns
    -------
    tuple
        ``(X, Y, tables)`` where ``X`` is the merged table (including the
        ``user_id`` / ``ie_id`` columns), ``Y`` is a float array with one
        label per row of ``X`` and ``tables`` is the full result of
        ``Get_Tables``.
    """
    num_users = num_ie = 200
    num_sat = 70
    tables = Get_Tables(num_users, num_ie, num_sat)
    X = tables[3]

    # True_Dependency unpacks its argument positionally, so the columns must
    # be selected by name in exactly the order it expects. The merged table
    # stores them in a different order (user_rating comes last), which used
    # to feed the wrong feature into every term of the label formula.
    feature_order = [
        'user_spends', 'user_rating', 'product_price',
        'product_quality', 'ie_rating', 'ie_vote_count',
    ]
    X_ordered = X[feature_order]

    num_rows = len(X)
    Y = np.zeros(num_rows)
    for i in range(num_rows):
        features = X_ordered.iloc[i].to_numpy()
        # True_Dependency may return a one-element array; take the scalar
        # explicitly rather than relying on implicit array-to-scalar
        # conversion, which recent NumPy versions deprecate.
        Y[i] = np.ravel(True_Dependency(features))[0]

    return X, Y, tables

### Creation of predictive model

In [None]:
# Generate the synthetic data, hold out a third of it for testing and fit a
# standardised logistic regression (C=0.01) on the feature columns only.
X, Y, tables = Generate_Data()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

ID_COLUMNS = ['user_id', 'ie_id']  # identifiers, not model features
clf = make_pipeline(StandardScaler(), LogisticRegression(C=0.01))
clf.fit(X_train.drop(columns=ID_COLUMNS), y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=0.01, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [None]:
# Score the held-out split. NOTE: "Mean Error" is the mean absolute error in
# rating points multiplied by 100, so it is not a true percentage.
test_features = X_test.drop(columns=['user_id', 'ie_id'])
y_pred = clf.predict(test_features)
accuracy_pct = round(100 * accuracy_score(y_test, y_pred), 2)
mean_error_pct = round(100 * mean_absolute_error(y_test, y_pred), 2)
print("Accuracy: {}%, Mean Error: {}%".format(accuracy_pct, mean_error_pct))

Accuracy: 91.67%, Mean Error: 8.33%


### Recommendation System

In [None]:
# Predictions on Logistic Regression model

def Predict_By_Model(table, user_id, model):
    """Rank the sellers a user already interacted with by predicted score.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain ``user_id`` and ``ie_id``; every other column is passed
        to the model as a feature.
    user_id : scalar
        User whose rows are scored.
    model : object
        Anything with a ``predict(features)`` method returning one score per
        row.

    Returns
    -------
    tuple of pandas.Series
        ``(ie_id, scores)`` sorted by descending score. Both are empty when
        ``table`` has no rows for ``user_id``.
    """
    # Work on an explicit copy: adding a column to a filtered slice triggers
    # pandas' SettingWithCopyWarning, and the caller's table must stay intact.
    search_space = table[table.user_id == user_id].copy()

    if search_space.empty:
        # Estimators typically reject zero-row input, so return early.
        return search_space['ie_id'], pd.Series(dtype=float, name='scores')

    search_space['scores'] = model.predict(
        search_space.drop(columns=['user_id', 'ie_id'])
    )
    search_space = search_space.sort_values('scores', ascending=False)

    return search_space['ie_id'], search_space['scores']
    

In [None]:
# Calculation based on the IMDB formula
# https://help.imdb.com/article/imdb/track-movies-tv/ratings-faq/G67Y87TFYYP6TWAV#

def IMDB_Reccomender(df):
    """Rank sellers with the IMDB weighted-rating formula.

    With ``v`` the seller's vote count, ``R`` its rating, ``C`` the mean
    rating over ``df`` and ``m`` the 90th percentile of vote counts over
    ``df``, the score is ``v / (v + m) * R + m / (v + m) * C``. Only sellers
    with at least ``m`` votes are ranked.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ie_id``, ``ie_rating`` and ``ie_vote_count``. It is
        not modified.

    Returns
    -------
    tuple of pandas.Series
        ``(ie_id, score)`` sorted by descending score.
    """
    C = df['ie_rating'].mean()
    m = df['ie_vote_count'].quantile(0.90)
    q_ie = df.loc[df['ie_vote_count'] >= m].copy()

    # Column arithmetic instead of a row-wise apply: same formula, no
    # per-row Python call.
    v = q_ie['ie_vote_count']
    R = q_ie['ie_rating']
    q_ie['score'] = (v / (v + m) * R) + (m / (m + v) * C)

    q_ie = q_ie.sort_values('score', ascending=False)

    return q_ie['ie_id'], q_ie['score']


In [None]:
# The first rows of the recommendation table are generated with the Logistic Regression model.
# The remaining rows are computed with the IMDB formula.

def Recomendation_System(merged_table, user_id, model, t_ie):
    """Build a recommendation table for one user.

    Sellers the user already has rows for are ranked first, using the
    model's predicted scores. All remaining sellers are then ranked with the
    IMDB weighted-rating formula and appended.

    Parameters
    ----------
    merged_table : pandas.DataFrame
        Joined user/seller/satisfaction rows, as accepted by
        ``Predict_By_Model``.
    user_id : scalar
        User to build recommendations for.
    model : object
        Fitted estimator exposing ``predict``.
    t_ie : pandas.DataFrame
        Seller table with ``ie_id``, ``ie_rating`` and ``ie_vote_count``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``recomended_id`` and ``predicted_score`` (rounded to two
        decimals), model-based rows first.
    """
    best_id, best_score = Predict_By_Model(merged_table, user_id, model)
    best_id_list = list(best_id)

    # Exclude sellers already covered by the model. The old loop iterated
    # over an undefined name (`table`) and discarded the result of `drop`,
    # so nothing was ever removed.
    rest_table = t_ie[~t_ie['ie_id'].isin(best_id_list)]

    if rest_table.empty:
        imdb_id, imdb_score = [], []
    else:
        imdb_id, imdb_score = IMDB_Reccomender(rest_table)

    out_table = pd.DataFrame({
        'recomended_id': best_id_list + list(imdb_id),
        # Explicit float dtype keeps round() valid even for an empty result.
        'predicted_score': pd.Series(list(best_score) + list(imdb_score), dtype=float),
    })
    out_table['predicted_score'] = out_table['predicted_score'].round(decimals=2)
    return out_table


In [None]:
# Build recommendations for the user found in the first row of the test split.
t_ie = tables[1]
target_user_id = X_test.iloc[0].user_id
recomendation = Recomendation_System(X_test, target_user_id, clf, t_ie)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# Display the recommendation table built in the previous cell.
recomendation

Unnamed: 0,recomended_id,predicted_score
0,77,5.0
1,127,5.0
2,66,4.08
3,123,4.08
4,163,4.07
5,95,4.06
6,150,4.06
7,100,3.57
8,70,3.56
9,58,3.56


In [None]:
# Persist the recommendation table as CSV on the mounted Google Drive.
output_path = "/content/drive/My Drive/datasets/2009-skoltech-hack/result_recomendation.csv"
recomendation.to_csv(output_path)