In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
from sklearn.preprocessing import LabelEncoder

In [2]:
train_csv = pd.read_csv('./train.csv')
test_csv = pd.read_csv('./test.csv')
train_csv.head()

Unnamed: 0,Viewers_ID,Joke_identifier,Response_ID,Rating
0,A1,Klint De Drunk Enugu 1,A1_Klint De Drunk Enugu 1,0.11
1,A1,Klint De Drunk Enugu 2,A1_Klint De Drunk Enugu 2,-4.64
2,A1,Klint De Drunk PH 1,A1_Klint De Drunk PH 1,-3.39
3,A1,Klint De Drunk PH 2,A1_Klint De Drunk PH 2,0.44
4,A1,Klint De Drunk Lagos 1,A1_Klint De Drunk Lagos 1,-4.83


In [0]:
train_data = train_csv.copy()
test_data = test_csv.copy()

# Data Encoding

Change viewers id  and jokes id text data to their numeric values.

In [0]:
def redefine_viewers_id(dataset):
    dataset['Viewers_ID'] = dataset['Viewers_ID'].str.slice(1)

redefine_viewers_id(train_data)
redefine_viewers_id(test_data)
train_data['Viewers_ID'] = train_data['Viewers_ID'].astype('int32')
test_data['Viewers_ID'] = test_data['Viewers_ID'].astype('int32')

In [0]:
jokes = train_data['Joke_identifier'].unique()
test_jokes = test_data['Joke_identifier'].unique()
jokes = np.append(jokes, test_jokes)
jokes = np.unique(jokes)
le = LabelEncoder()
le.fit(jokes)
def create_joke_id(dataset):
    dataset['Jokes_ID'] = le.transform(dataset['Joke_identifier'])
    
create_joke_id(train_data)
create_joke_id(test_data)

# Data Splitting

Split joke identifier into comedians, version and place. This is important for the data augmentation process.

In [0]:
train_data['Version'] = train_data['Joke_identifier'].apply(lambda x: x.split()[-1])
test_data['Version'] = test_data['Joke_identifier'].apply(lambda x: x.split()[-1])

In [0]:
train_data['Version'] = train_data['Version'].astype('int32')
test_data['Version'] = test_data['Version'].astype('int32')

In [0]:
test_data['Place'] = test_data['Joke_identifier'].apply(lambda x: x.split()[-2])
train_data['Place'] = train_data['Joke_identifier'].apply(lambda x: x.split()[-2])
test_data['Commedian'] = test_data['Joke_identifier'].apply(lambda x: x.split()[0:-2])
train_data['Commedian'] = train_data['Joke_identifier'].apply(lambda x: x.split()[0:-2])

In [0]:
test_data['Commedian'] = test_data['Commedian'].apply(lambda x: " ".join(x))
train_data['Commedian'] = train_data['Commedian'].apply(lambda x: " ".join(x))

# Data Correction And Encoding
Encode this splitted values. This step also includes correcting some of the wrong values produced by the process above.

In [10]:
commedians = train_data['Commedian'].unique()
commedians = np.append(commedians, test_data['Commedian'].unique())
commedians = np.unique(commedians)
commedians

array(['AY', 'Akpororo', 'AliBaba', 'AliBaba Aso', 'BasketMouth', 'Bovi',
       'Funny Bone', 'Gordons', 'Helen Paul', 'I Go Dye',
       'Klint De Drunk', 'MisterIbu', 'Okey Bakassi', 'Saka', 'Seyi Law'],
      dtype=object)

In [0]:
commedians = np.delete(commedians, 3)

In [0]:
places = np.unique(np.append(train_data['Place'].unique(), test_data['Place'].unique()))

In [13]:
places[7] = "Aso Rock"
places

array(['Abuja', 'Benin', 'Enugu', 'Ibadan', 'Lagos', 'Owerri', 'PH',
       'Aso Rock', 'Uyo', 'Warri'], dtype=object)

In [0]:
def repair_commedian(x):
    if x["Commedian"] == "AliBaba Aso":
        return "AliBaba"
    else:
        return x["Commedian"]

def repair_place(x):
    if x["Place"] == "Rock":
        return "Aso Rock"
    else:
        return x["Place"]
train_data["Place"] = train_data.apply(repair_place, axis=1)
train_data["Commedian"] = train_data.apply(repair_commedian, axis=1)

In [15]:
train_data.sample(15)

Unnamed: 0,Viewers_ID,Joke_identifier,Response_ID,Rating,Jokes_ID,Version,Place,Commedian
383058,32536,Klint De Drunk Enugu 2,A32536_Klint De Drunk Enugu 2,1.91,97,2,Enugu,Klint De Drunk
602592,9390,Klint De Drunk PH 2,A9390_Klint De Drunk PH 2,0.61,103,2,PH,Klint De Drunk
21301,1133,Klint De Drunk Enugu 2,A1133_Klint De Drunk Enugu 2,0.19,97,2,Enugu,Klint De Drunk
9525,10596,AliBaba PH 5,A10596_AliBaba PH 5,0.89,36,5,PH,AliBaba
208780,22819,Klint De Drunk Lagos 1,A22819_Klint De Drunk Lagos 1,3.69,99,1,Lagos,Klint De Drunk
293037,27462,Klint De Drunk Enugu 3,A27462_Klint De Drunk Enugu 3,0.44,98,3,Enugu,Klint De Drunk
275732,26461,I Go Dye Uyo 2,A26461_I Go Dye Uyo 2,4.03,95,2,Uyo,I Go Dye
159608,20069,BasketMouth Lagos 1,A20069_BasketMouth Lagos 1,2.53,40,1,Lagos,BasketMouth
347036,30568,AY Lagos 3,A30568_AY Lagos 3,3.06,5,3,Lagos,AY
86540,15745,AliBaba Abuja 1,A15745_AliBaba Abuja 1,-1.91,22,1,Abuja,AliBaba


In [0]:
test_data["Place"] = test_data.apply(repair_place, axis=1)
test_data["Commedian"] = test_data.apply(repair_commedian, axis=1)

In [0]:
le = LabelEncoder()
le.fit(commedians)
def create_commedian_id(dataset):
    dataset['Commedian_ID'] = le.transform(dataset['Commedian'])
    
create_commedian_id(train_data)
create_commedian_id(test_data)

In [0]:
le = LabelEncoder()
le.fit(places)
def create_place_id(dataset):
    dataset['Place_ID'] = le.transform(dataset['Place'])
    
create_place_id(train_data)
create_place_id(test_data)

In [19]:
train_data.head()

Unnamed: 0,Viewers_ID,Joke_identifier,Response_ID,Rating,Jokes_ID,Version,Place,Commedian,Commedian_ID,Place_ID
0,1,Klint De Drunk Enugu 1,A1_Klint De Drunk Enugu 1,0.11,96,1,Enugu,Klint De Drunk,9,3
1,1,Klint De Drunk Enugu 2,A1_Klint De Drunk Enugu 2,-4.64,97,2,Enugu,Klint De Drunk,9,3
2,1,Klint De Drunk PH 1,A1_Klint De Drunk PH 1,-3.39,102,1,PH,Klint De Drunk,9,7
3,1,Klint De Drunk PH 2,A1_Klint De Drunk PH 2,0.44,103,2,PH,Klint De Drunk,9,7
4,1,Klint De Drunk Lagos 1,A1_Klint De Drunk Lagos 1,-4.83,99,1,Lagos,Klint De Drunk,9,5


# Augmentation Process
This process looks for low variance relationship (in terms of ratings) between a user and a comedian, place and version in the training set and when it finds it, it uses the mean of the ratings as a predicted value in the test row.

If it does not find it, it gives that test row a rating of zero. It then looks for all the non-zero ratings rows in the test set and then adds them all to the training set. 

Note that it ignores comedians which have large number of rows in the train set and sets their test set values to zero.

In [0]:
"""
  Get the pandas group for a particular user by first trying to get it from the 
  cache. If the group is already stored in the cache, it returns it but if not 
  it calculates it with pandas and the adds the group to the cache. This speeds 
  up the data augmentation process
""" 
pandas_dict = {}
def get_data_group(user_id):
  if user_id in pandas_dict:
    user_data = pandas_dict[user_id]
  else:
    user_data = train_data.groupby('Viewers_ID').get_group(user_id)
    pandas_dict[user_id] = user_data
  return user_data

In [0]:
#Function for predicting the rating for a commedian using low variance condition.
def predict_rating(data):
  ignore_commedians = ['AY', 'Akpororo', 'AliBaba', 'BasketMouth', 'Bovi', 'I Go Dye', 'Klint De Drunk']
  user_id = data['Viewers_ID']
  user_place = data['Place_ID']
  user_commedian = data['Commedian_ID']
  user_version = data['Version']
  user_group = get_data_group(user_id)
  if data['Commedian'] in ignore_commedians:
    return 0
  if user_commedian in user_group['Commedian_ID'].values:
    commedian_group = user_group.groupby('Commedian_ID').get_group(user_commedian)
    if commedian_group['Rating'].count() > 1:
      if commedian_group['Rating'].std() <= 1 and commedian_group['Rating'].std() >= -1:
        rating = commedian_group['Rating'].mean()
        return rating
      
  if user_place in user_group['Place_ID'].values:
    place_group = user_group.groupby('Place_ID').get_group(user_place)
    if place_group['Rating'].count() > 1:
      if place_group['Rating'].std() <= 0.6 and place_group['Rating'].std() >= -0.6:
        rating = place_group['Rating'].mean()
        return rating
      
  if user_version in user_group['Version'].values:
    version_group = user_group.groupby('Version').get_group(user_version)
    if version_group['Rating'].count() > 1:
      if version_group['Rating'].std() <= 0.6 and version_group['Rating'].std() >= -0.6:
        rating = version_group['Rating'].mean()
        return rating
  return 0

In [23]:
def define_prediction(x):
  pred = predict_rating(x)
  return pred

test_data['Rating'] = test_data.apply(define_prediction, axis=1)

KeyboardInterrupt: ignored

In [0]:
nonzero_indices = test_data['Rating'].nonzero()

In [0]:
nonzero_values = test_data.iloc[nonzero_indices]

In [56]:
nonzero_values.describe()

Unnamed: 0,Viewers_ID,Jokes_ID,Version,Commedian_ID,Place_ID,Rating
count,59585.0,59585.0,59585.0,59585.0,59585.0,59585.0
mean,21297.304573,95.321272,2.777125,8.802903,2.685139,1.329253
std,11999.491196,25.652427,1.193094,2.829196,2.407945,2.61066
min,6.0,60.0,1.0,5.0,0.0,-5.0
25%,10406.0,71.0,2.0,6.0,0.0,0.111667
50%,21954.0,81.0,3.0,7.0,4.0,1.445
75%,32050.0,120.0,4.0,12.0,5.0,3.51
max,40855.0,138.0,5.0,13.0,5.0,5.0


In [0]:
train_data = train_data.append(nonzero_values)
len(train_data)

In [25]:
train_data.head()

Unnamed: 0,Commedian,Commedian_ID,Joke_identifier,Jokes_ID,Place,Place_ID,Rating,Response_ID,Version,Viewers_ID
0,Klint De Drunk,9,Klint De Drunk Enugu 1,96,Enugu,3,0.11,A1_Klint De Drunk Enugu 1,1.0,1.0
1,Klint De Drunk,9,Klint De Drunk Enugu 2,97,Enugu,3,-4.64,A1_Klint De Drunk Enugu 2,2.0,1.0
2,Klint De Drunk,9,Klint De Drunk PH 1,102,PH,7,-3.39,A1_Klint De Drunk PH 1,1.0,1.0
3,Klint De Drunk,9,Klint De Drunk PH 2,103,PH,7,0.44,A1_Klint De Drunk PH 2,2.0,1.0
4,Klint De Drunk,9,Klint De Drunk Lagos 1,99,Lagos,5,-4.83,A1_Klint De Drunk Lagos 1,1.0,1.0


# Training and Prediction 
Now that data augmentation process is done, I can now use the svd process from the 
surprise package to train and predict using the augmentated data.

In [27]:
!pip install scikit-surprise==1.0.5

Collecting scikit-surprise==1.0.5
[?25l  Downloading https://files.pythonhosted.org/packages/a4/a3/70a00e68fd212b37ad4a14d9e3bc417ee11535c036c2534bf9c3bd4c6876/scikit-surprise-1.0.5.tar.gz (2.4MB)
[K    100% |████████████████████████████████| 2.4MB 5.0MB/s 
Building wheels for collected packages: scikit-surprise
  Running setup.py bdist_wheel for scikit-surprise ... [?25l- \ | / - \ | / - \ | / - \ | done
[?25h  Stored in directory: /root/.cache/pip/wheels/f8/5e/34/656c22616c1a80ccb5fda84e8f116537dc6a0412c2203f57ad
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.0.5


In [0]:
from surprise.model_selection import train_test_split
from surprise import SVD

In [0]:
from surprise import accuracy, Reader, Dataset

reader = Reader(rating_scale=(-5, 5))
data_train = Dataset.load_from_df(train_data[['Viewers_ID', 'Jokes_ID', 'Rating']], reader)
test_data['Rating'] = 0
data_test = Dataset.load_from_df(test_data[['Viewers_ID', 'Jokes_ID', 'Rating']], reader)

In [0]:
from tqdm import tqdm

In [31]:
trainset, testset = train_test_split(data_train, train_size=0.75)
algo = SVD(n_epochs=50, lr_all=0.005, reg_all=0.2)
tqdm(algo.fit(trainset))
predictions = algo.test(testset)
accuracy.rmse(predictions)

0it [00:00, ?it/s]


RMSE: 2.1292


2.1291750559603764

In [33]:
trainset = data_train.build_full_trainset()
algo = SVD(n_epochs=50, lr_all=0.005, reg_all=0.2)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f94a8b125c0>

In [0]:
def define_prediction(x):
  pred = algo.predict(x['Viewers_ID'], x['Jokes_ID'])
  return pred.est

test_data['Rating'] = test_data.apply(define_prediction, axis=1)

In [0]:
result = test_data[['Response_ID', 'Rating']]

In [0]:
result.to_csv('submission.csv', index=None)