# INIT LANGUAGE CLASSIFIER

In [43]:
from pathlib import Path
import random
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
from sklearn.neighbors import *
from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d


def extract_features(text, features):
    '''Extracts all alphabetic characters and add their ratios as feature
    
    extract_features(text, features) -> np.array
    '''
    textlen = len(text)
    ratios = []
    text = text.lower()
    for feature in features:
        ratios.append(text.count(feature) / textlen)
    return np.array(ratios)

def predict_lang(text, clf):
    '''Predicts the language of a given text and classifier
    
    predict_lang(text, clf) -> str
    '''
    extracted_features = extract_features(text, features)
    return clf.predict(np.array(np.array([extracted_features])))[0]

def train_knn(x, y, k):
    '''Returns the trained k nearest neighbors classifier
    
    train_knn(x, y, k) -> sklearn.neighbors.KNeighborsClassifier
    '''
    clf = KNeighborsClassifier(k)
    clf.fit(x, y)
    return clf

def test_knn(clf, X, Y):
    '''Tests a given classifier with a testset and return result
    
    text_knn(clf, X, Y) -> float
    '''
    predictions = clf.predict(X)
    ratio_correct = len([i for i in range(len(Y)) if Y[i] == predictions[i]]) / len(Y)
    return ratio_correct

def read(file):
    '''Returns contents of a file'''
    with open(file, 'r', errors='ignore') as f:
        text = f.read()
    return text

def load_eu_texts():
    '''Read texts snipplets in 10 different languages into pd.Dataframe

    load_eu_texts() -> pd.Dataframe
    
    The text snipplets are taken from the nltk-data corpus.
    '''
    basepath = Path('/Users/gohost/nltk_data/corpora/europarl_raw/')
    df = pd.DataFrame(columns=['text', 'lang', 'len'])
    languages = [None]
    for lang in basepath.iterdir():
        languages.append(lang.as_posix())
        t = '\n'.join([read(p) for p in lang.glob('*')])
        d = pd.DataFrame()
        d['text'] = ''
        d['text'] = pd.Series(t.split('\n'))
        d['lang'] = lang.name.title()
        df = df.append(d.copy(), ignore_index=True)
    return df

def clean_eutextdf(df):
    '''Preprocesses the texts by doing a set of cleaning steps
    
    clean_eutextdf(df) -> cleaned_df
    '''
    # Cuts of whitespaces a the beginning and and
    df['text'] = [i.strip() for i in df['text']]
    # Generate a lowercase Version of the text column
    df['ltext'] = [i.lower() for i in df['text']]

    # Determining the length of each text
    df['len'] = [len(i) for i in df['text']]
    # Drops all texts that are not at least 200 chars long
    df = df.loc[df['len'] > 200]
    return df

def calc_charratios(df):
    '''Calculating ratio of any (alphabetical) char in any text of df for each lyric
    
    calc_charratios(df) -> list, pd.Dataframe
    '''
    CHARS = ''.join({c for c in ''.join(df['ltext']) if c.isalpha()})
    print('Counting Chars:')
    for c in CHARS:
        print(c, end=' ')
        df[c] = [r.count(c) for r in df['ltext']] / df['len']
    return list(CHARS), df

def split_dataset(df, ratio=0.5):
    '''Split the dataset into a train and a test dataset
    
    split_dataset(featuredf, ratio) -> pd.Dataframe, pd.Dataframe
    '''
    df = df.sample(frac=1).reset_index(drop=True)
    traindf = df[:][:int(df.shape[0] * ratio)]
    testdf = df[:][int(df.shape[0] * ratio):]
    return traindf, testdf

# Execute the above functions to load the texts
df = clean_eutextdf(load_eu_texts())
 
# Print a few stats of the read texts
textline = 'Number of text snippplets: ' + str(df.shape[0])
print('\n' + textline + '\n' + ''.join(['_' for i in range(len(textline))]))
c = Counter(df['lang'])
for l in c.most_common():
    print('%-25s' % l[0] + str(l[1]))
df.sample(10)

features, df = calc_charratios(df)

featuredf = pd.DataFrame()
featuredf['lang'] = df['lang']
for feature in features:
    featuredf[feature] = df[feature]
traindf, testdf = split_dataset(featuredf, ratio=0.80)
 
x = np.array([np.array(row[1:]) for index, row in traindf.iterrows()])
y = np.array([l for l in traindf['lang']])
X = np.array([np.array(row[1:]) for index, row in testdf.iterrows()])
Y = np.array([l for l in testdf['lang']])

print('''k\tPercentage of correctly predicted language
__________________________________________________''')
for i in range(1, 16):
    clf = train_knn(x, y, i)
    ratio_correct = test_knn(clf, X, Y)
    print(str(i) + '\t' + str(round(ratio_correct * 100, 3)) + '%')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,



Number of text snippplets: 63156
________________________________
Greek                    6681
French                   6466
German                   6395
Italian                  6383
Portuguese               6147
Spanish                  6016
Finnish                  5597
Swedish                  4940
Danish                   4914
Dutch                    4826
English                  4791
Counting Chars:
ι θ º â û e ΐ t φ d ä g ε α ú w y ì i γ κ ô ΰ ϋ δ h ζ a ó ύ ç m τ æ o π r ò z ο λ é ü ø f õ μ έ s ñ ϊ ù ê η c ς å ï p b è k ρ v ά î à ψ ξ ë β ö j u ν ª ω ή ã χ ό ί x í l q n ß υ á ώ σ k	Percentage of correctly predicted language
__________________________________________________
1	97.973%
2	98.005%
3	98.417%
4	98.377%
5	98.607%
6	98.567%
7	98.63%
8	98.615%
9	98.686%
10	98.638%
11	98.654%
12	98.678%
13	98.678%
14	98.67%
15	98.686%


# IMPORT LABELED DATA

In [4]:
import pandas as pd

In [6]:
# import from requirements engineering
l_req_eng = pd.read_excel("data/LData.xlsx", index_col='Unnamed: 0')
l_req_eng.head(1)

Unnamed: 0,app_name,user_id,user_name,date,country,version,score,topic,review,url,review_id,category_final,sentiment_final,req_final
52505.0,Facebook,246193109,Help is herr,2017-06-21 00:00:00,United States,97,1,Notifications not showing up,The notification badges are showing up on my i...,https://itunes.apple.com/WebObjects/MZStore.wo...,53163.0,requirement,neutral,functional


In [9]:
l_req_eng.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 3000 entries, 52505.0 to 45413.0
Data columns (total 14 columns):
app_name           3000 non-null object
user_id            3000 non-null object
user_name          2993 non-null object
date               3000 non-null object
country            3000 non-null object
version            3000 non-null object
score              3000 non-null int64
topic              2994 non-null object
review             2996 non-null object
url                2850 non-null object
review_id          2850 non-null float64
category_final     3000 non-null object
sentiment_final    2796 non-null object
req_final          1081 non-null object
dtypes: float64(1), int64(1), object(12)
memory usage: 351.6+ KB


In [30]:
l_req_eng.category_final.unique()

array(['requirement', 'other', 'bug report', 'noise', 'Requirement',
       'Bug Report', 'Other', 'Noise'], dtype=object)

In [28]:
# import from culture study
import sqlite3

db_path = "data/reviews_with_truthset/reviews_with_truthset.db"

conn = sqlite3.connect(db_path)
conn.text_factory = lambda b: b.decode(errors = 'ignore')

l_culture = pd.read_sql_query("SELECT * FROM labeledreviews", conn, index_col='id')
l_culture.head(1)

Unnamed: 0_level_0,app_name,user_id,user_name,date,country,version,score,topic,review_text,url,...,isBugReport,isUsageScenario,isGeneralPraise,isGeneralComplaint,isNoise,isOther,otherText,isNotSureOfLabels,comment,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Facebook,168083869,Enamul,2017-06-09 00:00:00,Singapore,96,5,Nice,It's helpful for all communication,https://itunes.apple.com/WebObjects/MZStore.wo...,...,0,0,1,0,0,0,,0,,male


In [29]:
# remove unnecessary columns
l_culture = l_culture.drop(['user_id', 'url', 'review_id', 'version', 'user_name',
                            'date', 'annotatingTime', 'otherText', 'isNotSureOfLabels',
                           'version', 'isAnnotated', 'annotator_id', 'comment'], axis=1)
l_culture.head(1)

Unnamed: 0_level_0,country,score,topic,review_text,sentimentScore,NotEnglish,isFeatureShortcoming,isFeatureStrength,isFeatureRequest,isBugReport,isUsageScenario,isGeneralPraise,isGeneralComplaint,isNoise,isOther,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,Singapore,5,Nice,It's helpful for all communication,1,0,0,0,0,0,0,1,0,0,0,male


In [10]:
l_culture.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2560 entries, 1 to 419
Data columns (total 29 columns):
app_name                2560 non-null object
user_id                 2560 non-null int64
user_name               2560 non-null object
date                    2560 non-null object
country                 2560 non-null object
version                 2560 non-null object
score                   2560 non-null int64
topic                   2560 non-null object
review_text             2560 non-null object
url                     2560 non-null object
review_id               2560 non-null int64
annotator_id            2560 non-null int64
isAnnotated             2560 non-null int64
annotatingTime          2560 non-null object
sentimentScore          2560 non-null int64
NotEnglish              2560 non-null int64
isFeatureShortcoming    2560 non-null int64
isFeatureStrength       2560 non-null int64
isFeatureRequest        2560 non-null int64
isBugReport             2560 non-null int64
isUsa

# REMOVING ALL NON-ENGLISH REVIEWS

# * FROM THE CULTURE STUDY

In [60]:
# the language classifier operates on a character level
# doesn't really work as well as advertised, but at least it did identify
# a number of French and Spanish/Portuguese reviews I can remove

l_culture_copy = l_culture.copy()

for index, row in l_culture_copy.iterrows():
    review = row['review_text']
    
    if len(review) < 100 or type(review) != str:
        continue
        
    language = predict_lang(review, clf)
        
    if language != 'English':
        l_culture_copy.drop([index])
        print(f"[{index}] {language}: {review}\n")


[12] Portuguese: My news feed shows only stuff from 4 days ago. I see maybe 1 new post to 20 from 4 days ago. Stop messing up a good app

[112] Italian: INSTAGRAM NEEDS TO GO BACK TO CHRONOLOGICAL ORDER!!! IM SICK OF LOOKIN LIKE A STALKER LIKER A POST FROM 5 DAYS AGO!!!

[183] German: Since the update it keeps freezing when I'm trying to up load pics and just in general. Please fix �

[302] Portuguese: Be a group r folder of pics it's good update there should be another update required is send or forward the same group of pics So no need to select all photos again to send some one else.

[314] French: Depuis une quinzaine de jours le Scrabble va mal. Je ne peux pas jouer avec mes amis Facebook, sauf deux. J'ai tout essayé...je suis très déçue car je jouais très très souvent.
Pouvez-vous faire quelque chose? Merci!

[334] Dutch: Since the update I have not been able to delete images without downloading them. Please enable that again.

[388] Spanish: Been great speaking to my dad in New 

[317] Italian: Great navigation app. I especially love the option to predict my trip time in advance and navigating around traffic. I find it to be about 80% accurate.

[373] Italian: App is good.but make a redesign or makeover in UI.pls its a request
 Please bring applock in ios.can u?

[389] French: Je n'apprécie pas du tout le design pour les commentaires et les réglages. J'aimais mieux comment c'était avant, maintenant, c'est moins clair. Je trouve aussi qu'il y a trop souvent de mises à jour, il faut se réadapter à chaque fois. 
 
 Merci de lire nos commentaires.

[419] French: J'utilise quotidiennement une application pour les routes pour m'indiquer principalement les bouchons et les autres chemins que je peux emprunter. Waze Est bien mais avec quelques bugs. Il est tres convivial, ce qui est appr̩ciable mais pas aussi fiable que son principal concurrent (et parent, en plus)



In [63]:
# manually curated list from reading the output above
foreign_indices = [314, 96, 384, 204, 300, 186, 277, 332, 140, 247, 251, 389, 419]

In [64]:
l_culture_copy = l_culture_copy.drop(foreign_indices)

In [66]:
# export the new (mostly English) labelled data
l_culture_copy.to_csv('english_culture_study_labels.csv', encoding='utf-8')

In [69]:
imp = pd.read_csv('english_culture_study_labels.csv', index_col='id')

In [70]:
imp.head()

Unnamed: 0_level_0,country,score,topic,review_text,sentimentScore,NotEnglish,isFeatureShortcoming,isFeatureStrength,isFeatureRequest,isBugReport,isUsageScenario,isGeneralPraise,isGeneralComplaint,isNoise,isOther,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,Singapore,5,Nice,It's helpful for all communication,1,0,0,0,0,0,0,1,0,0,0,male
2,United States,3,Chronological Order,PLEASE GO BACK TO CHRONOLOGICAL ORDER ON THE T...,0,0,1,0,1,0,0,0,0,0,0,unclear
3,Australia,5,Great time saver,Highly recommend,2,0,0,0,0,0,0,1,0,0,0,unclear
4,South Africa,5,Instagram App of the best I've used,"Works every time, all the time. Easy to use, p...",2,0,0,1,0,0,0,0,0,0,0,female
5,Singapore,1,No response for multiple photo upload,"After many times I have tried, there still hav...",0,0,0,0,0,1,0,0,0,0,0,unclear


In [71]:
len(imp)

2486

# * FROM REQUIREMENTS ENGINEERING

In [98]:
l_req_copy = l_req_eng.copy()
l_req_copy = l_req_copy.reset_index(drop=True)
l_req_copy.head()

Unnamed: 0,app_name,user_id,user_name,date,country,version,score,topic,review,url,review_id,category_final,sentiment_final,req_final
0,Facebook,246193109,Help is herr,2017-06-21 00:00:00,United States,97,1,Notifications not showing up,The notification badges are showing up on my i...,https://itunes.apple.com/WebObjects/MZStore.wo...,53163.0,requirement,neutral,functional
1,Facebook,43034279,javamdnss,2017-06-16 00:00:00,United States,97,1,Hate it!,Why do they make changes we don't need? Now th...,https://itunes.apple.com/WebObjects/MZStore.wo...,53905.0,other,very negative,
2,Facebook,496978255,,2017-05-27 00:00:00,Hong Kong,94,1,Useless function n poor experience,Story is useless n annoying to user. \nCan't s...,https://itunes.apple.com/WebObjects/MZStore.wo...,47401.0,other,negative,
3,Facebook,139595037,Gilbertiggy,2017-05-26 00:00:00,United Kingdom,94,1,To many updates!,This app is always having an update for someth...,https://itunes.apple.com/WebObjects/MZStore.wo...,42233.0,requirement,negative,functional
4,Facebook,180832062,Princess Lou 24,2017-06-01 00:00:00,United Kingdom,94,1,Photo albums,Just spent an hour trying to upload photos and...,https://itunes.apple.com/WebObjects/MZStore.wo...,42066.0,requirement,negative,non-functional


In [99]:
len(l_req_copy)

3000

In [101]:
for index, row in l_req_copy.iterrows():
    review = row['review']
    
    if type(review) != str or len(review) < 100:
        continue
        
    language = predict_lang(review, clf)
        
#     if language != 'English':
#         l_req_copy.drop([index]) # KEEP COMMENTED OUT! Language classifier not precise enough to auto drop.
#         print(f"[{index}] {language}: {review}\n")

len(l_req_copy)

3000

In [102]:
l_req_foreign = [56, 175, 351, 382, 473, 558, 1023,
                 1199, 1550, 1642, 1667, 2050, 2077,
                 2167, 2305, 2348, 2388, 2416, 2508,
                 2530, 2587, 2674, 2694, 2726, 2734,
                 2759, 2764, 2765, 2833
                ]

In [103]:
l_req_copy = l_req_copy.drop(l_req_foreign)
len(l_req_copy)

2971

In [84]:
# export the new (mostly English) labelled data
l_culture_copy.to_csv('english_req_eng_labels.csv', encoding='utf-8')

# REMOVE NAN ROWS

In [149]:
l_req_nan = l_req_copy.copy()
l_cul_nan = l_culture_copy.copy()
print(len(l_req_nan), len(l_cul_nan))

2971 2486


In [150]:
l_req_nan = l_req_nan.dropna(subset=['review'])
l_cul_nan = l_cul_nan.dropna(subset=['review_text'])

In [151]:
print(len(l_req_nan), len(l_cul_nan))

2967 2486


# REMOVE SHORT REVIEWS

# RENAME COLUMNS

In [153]:
l_req_nan.head(1)

Unnamed: 0,app_name,user_id,user_name,date,country,version,score,topic,review,url,review_id,category_final,sentiment_final,req_final
0,Facebook,246193109,Help is herr,2017-06-21 00:00:00,United States,97,1,Notifications not showing up,The notification badges are showing up on my i...,https://itunes.apple.com/WebObjects/MZStore.wo...,53163.0,requirement,neutral,functional


In [158]:
# remove unnecessary columns
# rename columns also
good_req = l_req_nan.copy()
good_req = good_req.drop(['app_name', 'user_name', 'user_id', 'date', 'version', 'review_id', 'url',
                         'req_final'], axis=1)
good_req = good_req.rename(columns = {
    "category_final": "category",
    "sentiment_final": "sentiment"
}, errors="raise")

In [159]:
good_req.head(5)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral
1,United States,1,Hate it!,Why do they make changes we don't need? Now th...,other,very negative
2,Hong Kong,1,Useless function n poor experience,Story is useless n annoying to user. \nCan't s...,other,negative
3,United Kingdom,1,To many updates!,This app is always having an update for someth...,requirement,negative
4,United Kingdom,1,Photo albums,Just spent an hour trying to upload photos and...,requirement,negative


In [142]:
len(l_cul_nan)

2486

In [143]:
## First we collapse all the columns in 'l_cul_nan' to be the same as 'l_req_nan'
l_cul = {
    'id' : [],
    'country' : [],
    'score' : [],
    'topic' : [],
    'review' : [],
    'category' : [],
    'sentiment' : []
}

for index, row in l_cul_nan.iterrows():
    if row['NotEnglish'] == 1:
        print(f"Skipping non-English: {row['review_text']}")
        continue
    
    required_cols = ['isFeatureShortcoming', 'isFeatureStrength', 'isFeatureRequest',
                    'isUsageScenario', 'isNoise', 'isBugReport', 'isGeneralComplaint',
                    'isOther', 'isGeneralPraise']
    col_counter = 0
    for col in required_cols:
        if row[col] == 0:
            col_counter+=1
    if col_counter == 9:
        print(f"Skipping indeterminate category review: {row['review_text']}")
        continue
        # otherwise, we knot it had a '1' for at least one column, that's all we need to make a decision
    
    if row['isNoise'] == 1:
        l_cul['category'].append("noise")
    elif row['isFeatureStrength'] == 1 or row['isUsageScenario'] == 1 or row['isFeatureRequest']:
        l_cul['category'].append("requirement")
    elif row['isBugReport'] == 1 or row['isFeatureShortcoming'] == 1:
        l_cul['category'].append("bug report")
    elif row['isOther'] == 1 or row['isGeneralPraise'] == 1 or row['isGeneralComplaint'] == 1:
        l_cul['category'].append("other")
    else:
        print(f"Couldn't figure out category for {row}")
        l_cul['category'].append("NA")
    
    l_cul['id'].append(index)
    l_cul['country'].append(row['country'])
    l_cul['score'].append(row['score'])
    l_cul['topic'].append(row['topic'])
    l_cul['review'].append(row['review_text'])
    
    s = row['sentimentScore']
    if s == -1 or s == -2:
        l_cul['sentiment'].append('negative')
    elif s == 0:
        l_cul['sentiment'].append('neutral')
    elif s == 1 or s == 2:
        l_cul['sentiment'].append('positive')
    else:
        print(f"Couldn't figure out sentiment for {row}")
        l_cul['sentiment'].append("NA")

good_cul = pd.DataFrame.from_dict(l_cul)
good_cul.index = good_cul['id']
good_cul = good_cul.drop('id', axis=1)
len(good_cul)

Skipping non-English: It nice apps but glti sy koi pic send ho jaa hai or vaps delete krni hai dono side sy to nhi ho rhi hai so..!
Skipping non-English: Bien pratique !
Skipping non-English: J'adore cette apps
Skipping indeterminate category review: I have found the app very useful in helping me track my diet and travel on a (fairly) healthy road
Skipping non-English: WoW Instagram ses super ?t même quand ta pas le réseau tu à droit à ta page d'accueil
Skipping non-English: Szybki update, czytelna, bardzo pomocna i bardziej aktualna od navi np z Google czy Apple
Skipping non-English: La version premium vaut la peine. Je trouve ça génial !
Skipping non-English: Superbe pour l'Ã©change d'idÃ©es !
Skipping non-English: Il est ou la traduction wtf disparu merci mise a jour de ?
Skipping non-English: Merecedor de 4 estrelas, só não ganha 5 por causa das propagandas inúteis.
Skipping non-English: Gros irritant, les vidéos devraient se lancer sur demande pas sur affichage.
Skipping non-Engli

2468

In [144]:
good_cul.head(5)

Unnamed: 0_level_0,country,score,topic,review,category,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Singapore,5,Nice,It's helpful for all communication,other,positive
2,United States,3,Chronological Order,PLEASE GO BACK TO CHRONOLOGICAL ORDER ON THE T...,requirement,neutral
3,Australia,5,Great time saver,Highly recommend,other,positive
4,South Africa,5,Instagram App of the best I've used,"Works every time, all the time. Easy to use, p...",requirement,positive
5,Singapore,1,No response for multiple photo upload,"After many times I have tried, there still hav...",bug report,neutral


In [162]:
print(len(good_req), len(good_cul))

2967 2468


# LOWERCASE LABELS (5-class to 3-class, etc.)

In [163]:
good_cul.category.unique()

array(['other', 'requirement', 'bug report', 'noise'], dtype=object)

In [170]:
good_cul.sentiment.unique()

array(['positive', 'neutral', 'negative'], dtype=object)

In [164]:
good_req.category.unique()

array(['requirement', 'other', 'bug report', 'noise', 'Requirement',
       'Bug Report', 'Other', 'Noise'], dtype=object)

In [171]:
good_req.sentiment.unique()

array(['neutral', 'very negative', 'negative', 'positive', nan,
       'very positive', 'Neutral', 'Negative', 'Positive',
       'Very positive', 'Very Positive', 'postive', 'positve',
       'Very Negative'], dtype=object)

In [168]:
SENTIMENT_CLASSES = 3

def collapse_sentiment_labels(labels):
    if SENTIMENT_CLASSES == 3:
        pos = ['positive', 'positve', 'postive', 'very positive']
        neg = ['negative', 'very negative']
        neu = ['neutral']
        
    elif SENTIMENT_CLASSES == 5:
        pos = ['positive', 'positve', 'postive']
        neg = ['negative']
        neu = ['neutral']
    
    new_labels = []

    for label in labels:
        label = label.lower()
        if label in pos:
            label = "positive"
        elif label in neg:
            label = "negative"
        elif label in neu:
            label = "neutral"
        new_labels.append(label)
    return pd.Series(new_labels)

def collapse_category_labels(labels):
    new_labels = []
    req = ['requirement', 'Requirement']
    oth = ['Other', 'other']
    bug = ['bug report', 'Bug report']
    noi = ['noise', 'Noise']
    for label in labels:
        label = label.lower()
        if label in req:
            new_labels.append('requirement')
        elif label in oth:
            new_labels.append('other')
        elif label in bug:
            new_labels.append('bug report')
        elif label in noi:
            new_labels.append('noise')
    return pd.Series(new_labels)

In [169]:
collapse_category_labels(good_req.category).unique()

array(['requirement', 'other', 'bug report', 'noise'], dtype=object)

In [174]:
len(good_req.sentiment)

2967

In [191]:
def check_invalid(labels):
    count = 0
    for l in labels:
        if type(l) != str or l == "NA":
            count += 1
    print(f"Found {count} NaN labels")
            
check_invalid(good_req.sentiment)
check_invalid(good_req.category)

Found 201 NaN labels
Found 0 NaN labels


In [192]:
good_req = good_req.dropna(subset=['sentiment'])
len(good_req.sentiment)

2591

In [180]:
2967 - 2792

175

In [185]:
good_req.sentiment = collapse_sentiment_labels(good_req.sentiment)

AttributeError: 'float' object has no attribute 'lower'

In [186]:
good_req.head(5)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral
1,United States,1,Hate it!,Why do they make changes we don't need? Now th...,other,negative
2,Hong Kong,1,Useless function n poor experience,Story is useless n annoying to user. \nCan't s...,other,negative
3,United Kingdom,1,To many updates!,This app is always having an update for someth...,requirement,negative
4,United Kingdom,1,Photo albums,Just spent an hour trying to upload photos and...,requirement,negative


In [187]:
good_cul.head(5)

Unnamed: 0_level_0,country,score,topic,review,category,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Singapore,5,Nice,It's helpful for all communication,other,positive
2,United States,3,Chronological Order,PLEASE GO BACK TO CHRONOLOGICAL ORDER ON THE T...,requirement,neutral
3,Australia,5,Great time saver,Highly recommend,other,positive
4,South Africa,5,Instagram App of the best I've used,"Works every time, all the time. Easy to use, p...",requirement,positive
5,Singapore,1,No response for multiple photo upload,"After many times I have tried, there still hav...",bug report,neutral


# MAKE IDENTICAL DATAFRAME STRUCTURE

In [194]:
good_req.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2591 entries, 0 to 2791
Data columns (total 6 columns):
country      2591 non-null object
score        2591 non-null int64
topic        2591 non-null object
review       2591 non-null object
category     2591 non-null object
sentiment    2591 non-null object
dtypes: int64(1), object(5)
memory usage: 141.7+ KB


In [189]:
good_cul.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2468 entries, 1 to 5
Data columns (total 6 columns):
country      2468 non-null object
score        2468 non-null int64
topic        2468 non-null object
review       2468 non-null object
category     2468 non-null object
sentiment    2468 non-null object
dtypes: int64(1), object(5)
memory usage: 135.0+ KB


In [195]:
frames = [good_cul, good_req]
result = pd.concat(frames)
len(result)

5059

In [196]:
result.head(10)

Unnamed: 0,country,score,topic,review,category,sentiment
1,Singapore,5,Nice,It's helpful for all communication,other,positive
2,United States,3,Chronological Order,PLEASE GO BACK TO CHRONOLOGICAL ORDER ON THE T...,requirement,neutral
3,Australia,5,Great time saver,Highly recommend,other,positive
4,South Africa,5,Instagram App of the best I've used,"Works every time, all the time. Easy to use, p...",requirement,positive
5,Singapore,1,No response for multiple photo upload,"After many times I have tried, there still hav...",bug report,neutral
6,United Kingdom,5,Fantastic - beats all other sat navs I have used,"Great, I use it daily and find it great for pl...",other,positive
7,Australia,4,#feedalgorithmsucks,"Love the filters, hate the non-chronological f...",requirement,neutral
8,Canada,1,It's not working,Every time I open the app it says couldn't ref...,bug report,neutral
9,United Kingdom,1,WTF!!!!,"You can't even call this an app, it's the bigg...",bug report,negative
10,United Kingdom,1,Newsfeed,Showing posts from weeks ago and can't get it ...,bug report,neutral


In [197]:
result.to_csv('labels_combined.csv', encoding='utf-8')