In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from pprint import pprint

In [2]:
data = pd.read_csv('train_new.csv').sample(frac=1) #shuffle
data = data.loc[data['rating'].isin([1, 2, 3, 4, 5])]
data = data.fillna(0)
data = data.drop(['merchant_id', 'merchant_profile_picture', 'id', 'tags'], axis=1)

In [3]:
# splitting. alternative in sklearn train_test_split. 
# I use holdout method for validation, but you are encouraged to use anything else e.g. cross_validation
msk = np.random.rand(len(data)) < 0.7
tr = data[msk]
val = data[~msk]

In [4]:
# Processing the categorical features.
# Models do not understand string values by default, we need to encode them into integer.
# for example: 
# banana -> 1
# apple  -> 2
# 
# in order to do this, we need to build a mapping between string values and integer values.
# also there is a chance that in the testing set, some values do not exist in the training set, so we need a default one for those.
# for example, in the testing/validation set:
# pear -> -1 (pear does not exist)


# a dictionary that contains the mapping for each feature
dict_cat = {}


# columns that are of categorical value
cat_cols = tr.columns[tr.dtypes==object].to_list()



def cat_digit(col):  
    # build the mapping
    encoded = col.astype('category').cat.codes
    # store the mapping
    dict_cat[col.name] = dict(zip(np.asarray(col), np.asarray(encoded)))
    return encoded

# for each categorical feature, apply cat_digit where we build the mapping and transform the data
# this is for the training set (where we build the mapping)
tr[cat_cols] = tr[cat_cols].apply(lambda col: cat_digit(col))
tr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,price,retail_price,currency_buyer,units_sold,uses_ad_boosts,rating,rating_count,badges_count,badge_local_product,badge_product_quality,...,urgency_text,origin_country,merchant_title,merchant_name,merchant_info_subtitle,merchant_rating_count,merchant_rating,merchant_has_profile_picture,theme,crawl_month
987,22.0,19,0,20000,1,4.0,1103,0,0,0,...,0,1,451,367,422,4107,4.096908,0,0,0
65,5.9,5,0,10000,0,4.0,1933,0,0,0,...,0,1,286,71,47,2366,3.951817,0,0,0
341,15.0,115,0,50000,1,4.0,6735,0,0,0,...,0,1,53,139,561,165915,4.256077,0,0,0
252,7.0,6,0,100,0,5.0,27,1,0,0,...,0,1,381,251,39,18529,4.135949,0,0,0
410,12.0,11,0,100,0,4.0,30,0,0,0,...,1,0,125,284,11,105015,3.789601,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,7.0,68,0,10,0,2.0,2,0,0,0,...,1,1,510,470,77,397,3.858942,0,0,0
1053,1.8,2,0,1000,1,4.0,76,0,0,0,...,0,1,515,476,378,4230,4.041371,0,0,0
261,11.0,10,0,1000,0,5.0,410,0,0,0,...,0,1,35,81,177,1733,3.842470,0,0,0
1079,7.0,50,0,1000,1,4.0,408,0,0,0,...,1,1,390,260,306,12763,3.967484,0,0,0


In [5]:
print('categorical features')
pprint(list(dict_cat.keys()))

categorical features
['currency_buyer',
 'product_color',
 'product_variation_size_id',
 'shipping_option_name',
 'urgency_text',
 'origin_country',
 'merchant_title',
 'merchant_name',
 'merchant_info_subtitle',
 'theme',
 'crawl_month']


In [6]:
print('Lets see what the mapping for column origin_country :')
pprint(dict_cat['origin_country'])
print('It is a string to integer mapping')

Lets see what the mapping for column origin_country :
{0: 0, 'CN': 1, 'GB': 2, 'SG': 3, 'US': 4, 'VE': 5}
It is a string to integer mapping


In [7]:
# then we will use the mappings built from the training set, to transform the validation set
val[cat_cols] = val[cat_cols].apply(lambda col: col.map(dict_cat[col.name]))
# for string values that not seen in training set, we replace it with -1
val = val.fillna(-1)
val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,price,retail_price,currency_buyer,units_sold,uses_ad_boosts,rating,rating_count,badges_count,badge_local_product,badge_product_quality,...,urgency_text,origin_country,merchant_title,merchant_name,merchant_info_subtitle,merchant_rating_count,merchant_rating,merchant_has_profile_picture,theme,crawl_month
959,8.0,8,0,10000,0,4.0,1858,1,0,1,...,0.0,1,-1.0,-1.0,-1.0,19879,3.994718,0,0,0
803,8.0,59,0,100,0,4.0,45,0,0,0,...,0.0,1,-1.0,-1.0,-1.0,58154,3.871359,1,0,0
197,6.0,6,0,20000,1,4.0,1600,0,0,0,...,0.0,1,213.0,481.0,-1.0,16013,4.108849,0,0,0
547,8.0,7,0,20000,1,5.0,3090,0,0,0,...,0.0,1,-1.0,-1.0,-1.0,21892,4.146857,0,0,0
98,11.0,11,0,1000,0,4.0,213,0,0,0,...,0.0,1,140.0,315.0,221.0,17120,3.898890,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,8.0,7,0,100,0,4.0,139,0,0,0,...,0.0,1,-1.0,-1.0,-1.0,12745,4.277991,0,0,0
749,8.0,7,0,10000,0,4.0,2396,0,0,0,...,0.0,1,-1.0,-1.0,-1.0,6329,4.162901,0,0,0
53,13.0,11,0,100,0,4.0,11,0,0,0,...,0.0,1,-1.0,-1.0,-1.0,38,3.473684,0,0,0
335,6.0,6,0,1000,1,4.0,895,0,0,0,...,0.0,1,164.0,427.0,324.0,25260,3.995606,0,0,0


In [8]:
tr_y = tr['rating']
tr_x = tr.drop('rating', axis=1)
clf = LogisticRegression().fit(tr_x, tr_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
val_y = val['rating']
val_x = val.drop('rating', axis=1)
pred_val = clf.predict(val_x)

In [10]:
val_score = f1_score(val_y, pred_val, average='micro')
print(val_score)

0.7507788161993769


In [11]:
# once you are happy with your local model, let's prepare a submission
# we need to apply the same preprocessing steps on the testing set as you did before you train the model

test_data = pd.read_csv('test_new.csv').sample(frac=1) 
_id = test_data['id']
test_data = test_data.fillna(0)
test_data = test_data.drop(['merchant_id', 'merchant_profile_picture', 'id', 'tags'], axis=1)
test_data[cat_cols] = test_data[cat_cols].apply(lambda col: col.map(dict_cat[col.name]))

# again, not-seen string value filled with -1
test_data = test_data.fillna(-1)

In [12]:
pred_test = clf.predict(test_data)
pred_df = pd.DataFrame(data={'id': np.asarray(_id), 'rating': pred_test})
pred_df.to_csv('pred_walkthrough.csv', index=False)