In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

/kaggle/input/student-shopee-code-league-sentiment-analysis/sampleSubmission.csv
/kaggle/input/student-shopee-code-league-sentiment-analysis/test.csv
/kaggle/input/student-shopee-code-league-sentiment-analysis/train.csv


# Loading training data

In [14]:
# Read the training reviews; the first CSV column becomes the frame's index.
TRAIN_CSV = '/kaggle/input/student-shopee-code-league-sentiment-analysis/train.csv'
train_df = pd.read_csv(TRAIN_CSV, index_col=0)
train_df.head()

Unnamed: 0_level_0,review,rating
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Ga disappointed neat products .. Meletot Hilsn...,1
1,"Rdtanya replace broken glass, broken chargernya",1
2,Nyesel bngt dsni shopping antecedent photo mes...,1
3,Sent a light blue suit goods ga want a refund,1
4,Pendants came with dents and scratches on its ...,1


In [13]:
# train_df is loaded with index_col=0, so review_id lives in the index rather
# than in a column; `train_df.review_id` would raise AttributeError here.
# Verify that every review_id is distinct, i.e. that it is a valid index.
assert train_df.index.is_unique, 'review_id contains duplicate values'
len(train_df)

146811

In [9]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146811 entries, 0 to 146810
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   review  146811 non-null  object
 1   rating  146811 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.4+ MB


In [10]:
# Number of missing values in 'rating'; 0 means the column has no nulls.
int(train_df['rating'].isna().sum())

0

In [15]:
# How many training reviews carry each rating value, most frequent first.
train_df['rating'].value_counts()

4    41865
5    41515
3    35941
1    14785
2    12705
Name: rating, dtype: int64

# Trying Vader

In [16]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 2.7 MB/s eta 0:00:01
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [22]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single shared analyzer instance; mapping_all_class() calls its
# polarity_scores() method once per review.
analyser = SentimentIntensityAnalyzer()

In [23]:
def mapping_all_class(df, analyzer=None):
    """Map each review text to a 1-5 rating from its sentiment compound score.

    Parameters
    ----------
    df : iterable of str
        The review texts to score. Despite the name this is iterated
        element by element, so callers pass a column such as
        ``train_df.review`` rather than a whole DataFrame.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. When omitted, the notebook-level
        ``analyser`` instance is used, which keeps existing calls working.

    Returns
    -------
    list of int
        One rating per input element, in input order:

        * compound >= 0.1           -> 5
        * 0.05 < compound < 0.1     -> 4
        * -0.05 <= compound <= 0.05 -> 3
        * -0.1 < compound < -0.05   -> 2
        * compound <= -0.1          -> 1
    """
    if analyzer is None:
        # Looked up at call time so the function can be defined before the
        # analyzer cell has run.
        analyzer = analyser

    res = []

    for review in df:
        # Missing entries (None / NaN) are scored as empty text instead of
        # raising; any other non-string value is scored via its string form.
        is_missing = review is None or (isinstance(review, float) and review != review)
        text = '' if is_missing else str(review)

        compound = analyzer.polarity_scores(text)['compound']

        # Cascading thresholds: each branch only needs its lower bound because
        # the branches above have already claimed the higher scores.
        if compound >= 0.1:
            pred = 5
        elif compound > 0.05:
            pred = 4
        elif compound >= -0.05:
            pred = 3
        elif compound > -0.1:
            pred = 2
        else:
            pred = 1

        res.append(pred)

    return res

In [25]:
# Score every training review and store the predicted rating next to the
# true 'rating' column so the two can be compared row by row.
res = mapping_all_class(train_df['review'])
train_df['vader_rating'] = res
train_df.head()

Unnamed: 0_level_0,review,rating,vader_rating
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Ga disappointed neat products .. Meletot Hilsn...,1,5
1,"Rdtanya replace broken glass, broken chargernya",1,1
2,Nyesel bngt dsni shopping antecedent photo mes...,1,5
3,Sent a light blue suit goods ga want a refund,1,4
4,Pendants came with dents and scratches on its ...,1,5


In [26]:
def accuracy_test(df):
    """Print how often ``df['vader_rating']`` equals ``df['rating']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'vader_rating'`` and ``'rating'``.

    Returns
    -------
    None
        The counts of correct, wrong and total predictions and the accuracy
        percentage are printed, not returned.
    """
    total = len(df)

    # One vectorised row-by-row comparison instead of a Python loop with
    # two .iloc lookups per row.
    corr = int(df['vader_rating'].eq(df['rating']).sum())
    wrong = total - corr

    print('Correct predictions:', corr)
    print('Wrong predictions:', wrong)
    print('Total predictions:', total)

    if total == 0:
        # An empty frame has no defined accuracy; avoid dividing by zero.
        print('\nAccuracy rate:', 'n/a (no predictions)')
        return

    accuracy_rate = corr / total
    print('\nAccuracy rate:', '{:.2%}'.format(accuracy_rate))

In [27]:
# Compare the VADER-derived ratings with the true ratings on the training data.
accuracy_test(train_df)

Correct predictions: 49593
Wrong predictions: 97218
Total predictions: 146811

Accuracy rate: 33.78%


Accuracy on the test set using the threshold mapping above: 39.6%

# Loading testing data

In [31]:
# Read the unlabeled test reviews; the default integer index is kept.
TEST_CSV = '/kaggle/input/student-shopee-code-league-sentiment-analysis/test.csv'
test_df = pd.read_csv(TEST_CSV)
test_df.head()

Unnamed: 0,review_id,review
0,1,"Great danger, cool, motif and cantik2 jg model..."
1,2,One of the shades don't fit well
2,3,Very comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. ...
4,5,it's sooooo cute! i like playing with the glit...


In [36]:
# Read the sample submission to see the expected output columns.
SAMPLE_CSV = '/kaggle/input/student-shopee-code-league-sentiment-analysis/sampleSubmission.csv'
sample = pd.read_csv(SAMPLE_CSV)
sample.head()

Unnamed: 0,review_id,rating
0,0,4
1,1,3
2,2,5
3,3,1
4,4,2


In [32]:
# Predict a rating for every test review with the same threshold mapping
# that was applied to the training data.
res_test = mapping_all_class(test_df['review'])
test_df['rating'] = res_test
test_df.head()

Unnamed: 0,review_id,review,rating
0,1,"Great danger, cool, motif and cantik2 jg model...",5
1,2,One of the shades don't fit well,1
2,3,Very comfortable,5
3,4,Fast delivery. Product expiry is on Dec 2022. ...,5
4,5,it's sooooo cute! i like playing with the glit...,5


In [47]:
# The submission keeps every column of test_df except the raw review text.
submission_df = test_df.drop(columns=['review'])
submission_df.head()

Unnamed: 0,review_id,rating
0,1,5
1,2,1
2,3,5
3,4,5
4,5,5


In [48]:
# Number of rows that will be written to the submission file.
len(submission_df)

60427

In [49]:
# index=False keeps the pandas row index out of the CSV, so only the frame's
# own columns are written.
submission_df.to_csv('submission_vader_1.csv', index=False)