### TANZANIA INDABAX VIRTUAL HACKATHON 2023: TELECOM SENTIMENT ANALYSIS CHALLENGE

## Let's Get Started 

In [1]:
# import important modules
import numpy as np
import pandas as pd

# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier #classifier

from sklearn.metrics import accuracy_score #evaluation metric
from sklearn.feature_extraction.text import CountVectorizer

# text preprocessing modules
import re 
from string import punctuation 

import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

In [2]:
# load data
path = ''
train = pd.read_csv(path+"train.csv")
test = pd.read_csv(path+"test.csv")
submission = pd.read_csv(path+"sample_submission.csv")

In [3]:
# show top five rows of train data
train.head() 

Unnamed: 0,train_id,comment,sentiment
0,bbe0f441-b2fc-461a-9618-e3a7eb9cb6b7,Habari @telecom1_TZ naomba kujua kama kuna hud...,-1
1,81b3b16d-8bf0-4d46-9b68-d7ffb078d108,@telecom3_tanzania Hivi telecom3 mnatuchukulia...,-1
2,f4785623-6bb9-49de-b905-25911a66f1d3,@telecom2 wamupunguza vifurushi vya internet k...,0
3,0ec5ec2b-0811-4b1d-be70-9cc0c137159a,@telecom2 Kwenye huduma1 nimewakubali sana sem...,-1
4,40082169-ebd0-4ca9-b0d3-c46ac949e034,@IdrisSultan vip kuhusu hawa mzee.\n Umefiki...,0


In [4]:
# show top five rows of test data
test.head()

Unnamed: 0,test_id,comment
0,53bb8dff-4a07-4aac-a6f2-ca552389652c,huduma1 wamenitext et wameniona mataa😣
1,a01e1c68-2f77-4073-87c5-d7e5d75c8a79,@telecom1_TZ @telecom3Tanzania Na mtuue sasa
2,9a5e4b98-ac29-4138-bc28-fc878d5d8269,Hakuna mtandao wa simu unaoongoza kutuma messa...
3,9ef55f06-7748-4e02-85c7-fa3e46bb02dd,@Mwanzalimaa Itakuwa kaipenda no yako kaenda k...
4,d93b4084-3000-4026-bae0-ecb65564dbbd,@telecom1_TZ Mm Ni wakala wa usajili wa lain c...


In [5]:
# show top five rows of submision file
submission.head()

Unnamed: 0,test_id,sentiment
0,53bb8dff-4a07-4aac-a6f2-ca552389652c,-1.0
1,a01e1c68-2f77-4073-87c5-d7e5d75c8a79,0.0
2,9a5e4b98-ac29-4138-bc28-fc878d5d8269,1.0
3,9ef55f06-7748-4e02-85c7-fa3e46bb02dd,
4,d93b4084-3000-4026-bae0-ecb65564dbbd,


In [6]:
# check the shape of the train data
train.shape

(3123, 3)

In [7]:
# check the shape of the test data
test.shape

(1041, 2)

In [8]:
# check missing values in train data
train.isnull().sum()

train_id     0
comment      0
sentiment    0
dtype: int64

In [9]:
# check missing values in test data
test.isnull().sum()

test_id    0
comment    0
dtype: int64

In [10]:
# evalute Labels distribution
train.sentiment.value_counts()

-1    1522
 0    1177
 1     424
Name: sentiment, dtype: int64

### Data Preparation 

In [11]:
# a simple function to clean text data 

def text_cleaning(text):
    # Clean the text data

    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
    text = text.lower()  # set in lowercase 
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
        
    # Return a list of words
    return(text)

In [12]:
#clean the train and test data
train["comment"] = train["comment"].apply(text_cleaning)
test["comment"] = test["comment"].apply(text_cleaning)

In [13]:
#split features and target from train data 
X = train["comment"]
y = train.sentiment.values

In [14]:
# Transform text data 
vectorizer = CountVectorizer(lowercase=False)

vectorizer.fit(X)

#transform train data 
X_transformed = vectorizer.transform(X)

#transform test data
test_transformed = vectorizer.transform(test["comment"])

In [15]:
# split data into train and validate

X_train, X_valid, y_train, y_valid = train_test_split(
    X_transformed,
    y,
    test_size=0.10,
    random_state=42,
    shuffle=True,
    stratify=y,
)

### Create Classifier 

In [16]:
# Create a classifier
tweets_classifier = RandomForestClassifier() 

In [17]:
# train the tweets_classifier 
tweets_classifier.fit(X_train,y_train)

In [18]:
# test model performance on valid data 
y_preds = tweets_classifier.predict(X_valid)

In [19]:
# evalute model performance by using accuracy_score in the validation data
accuracy_score(y_valid, y_preds) 

0.5686900958466453

In [20]:
# create prediction from the test data
test_preds = tweets_classifier.predict(test_transformed)

### Create Submission File

In [21]:
# create submission file 
submission["sentiment"] = test_preds

In [22]:
# show sample submissoin rows
submission.head() 

Unnamed: 0,test_id,sentiment
0,53bb8dff-4a07-4aac-a6f2-ca552389652c,0
1,a01e1c68-2f77-4073-87c5-d7e5d75c8a79,-1
2,9a5e4b98-ac29-4138-bc28-fc878d5d8269,-1
3,9ef55f06-7748-4e02-85c7-fa3e46bb02dd,0
4,d93b4084-3000-4026-bae0-ecb65564dbbd,-1


In [23]:
# save submission file 
submission.to_csv(path+"first_submission.csv",index=False) 

Now upload your first submssion file on the hackathon page 👍