In [1]:
import pandas as pd

In [2]:
# Load the raw Swahili sentiment data; the path is relative to this notebook.
raw_csv_path = "../swahili.csv"
df = pd.read_csv(raw_csv_path)

### Explore the dataset:
Perform basic exploratory data analysis (EDA) to understand the structure and content of the dataset.

In [3]:
# Preview the leading rows (5 is the head() default, spelled out here)
first_rows = df.head(5)
print(first_rows)

   Unnamed: 0                                text    labels
0           0  team 2019merimera alikuwa takataka  negative
1           1                      sijafurahishwa  negative
2           2                       kubuni dosari  negative
3           3   bila kusema nilipoteza pesa zangu  negative
4           4        sema kupoteza pesa na wakati  negative


In [4]:
# Report the frame's dimensions as (rows, columns)
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(3925, 3)


In [5]:
# List the column labels present in the frame
column_labels = df.columns
print(column_labels)

Index(['Unnamed: 0', 'text', 'labels'], dtype='object')


In [6]:
# Show the dtype pandas inferred for every column
column_dtypes = df.dtypes
print(column_dtypes)

Unnamed: 0     int64
text          object
labels        object
dtype: object


In [7]:
# Drop the leftover index column ("Unnamed: 0") that read_csv picked up.
# It is removed by name rather than by position, and missing columns are
# ignored, so re-running this cell cannot accidentally drop `text`.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.head()

Unnamed: 0,text,labels
0,team 2019merimera alikuwa takataka,negative
1,sijafurahishwa,negative
2,kubuni dosari,negative
3,bila kusema nilipoteza pesa zangu,negative
4,sema kupoteza pesa na wakati,negative


## Handle missing values: 
* Check for missing values in the dataset and decide on an appropriate strategy to handle them. 
* Rows with missing values can simply be dropped.

In [8]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

text      0
labels    0
dtype: int64


In [10]:
# The check above reported no missing values, so nothing is executed in this cell.
# If gaps ever appear, drop the incomplete rows:
# df = df.dropna()

# Mean imputation (kept for reference) only makes sense for numeric columns,
# not for the text or label columns of this dataset:
# df['column_name'].fillna(df['column_name'].mean(), inplace=True)

## Clean the text data: 
Apply text cleaning techniques to preprocess the text data. <br>
This may involve steps like removing punctuation, converting text to lowercase, removing stop words, and handling special characters.

In [11]:

import re

# Pre-compiled patterns for the cleaning steps below.
# `\w` treats "_" as a word character, so it is listed explicitly in order to
# be stripped together with the rest of the punctuation.
punctuation_pattern = re.compile(r'[^\w\s]|_')
digits_pattern = re.compile(r'\d+')
whitespace_pattern = re.compile(r'\s+')

# Vectorised `.str` methods are used instead of `apply(lambda ...)` with
# `re.sub`: they propagate missing values instead of raising a TypeError.
df['text'] = (
    df['text']
    .str.replace(punctuation_pattern, '', regex=True)  # remove punctuation
    .str.lower()                                       # convert to lowercase
    .str.replace(digits_pattern, '', regex=True)       # remove numbers
    # Deleting tokens can leave doubled / leading / trailing spaces behind;
    # collapse every whitespace run to one space and trim the ends.
    .str.replace(whitespace_pattern, ' ', regex=True)
    .str.strip()
)


In [12]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
label_column = "labels"
label_to_id = {"positive": 1, "negative": 0}

# Look each value up in the mapping and pass unknown values through unchanged.
# Values that are already 0/1 (cell re-run) therefore stay as they are.
encoded_labels = df[label_column].map(lambda value: label_to_id.get(value, value))

# Fail loudly on anything that is not a known label (e.g. "neutral", a
# differently-cased label or a missing value) instead of silently leaving a
# mixed string/int column behind.
unexpected_labels = encoded_labels[~encoded_labels.isin(list(label_to_id.values()))]
if not unexpected_labels.empty:
    raise ValueError(
        f"Unexpected values in '{label_column}': "
        f"{sorted(unexpected_labels.astype(str).unique())}"
    )

# Cast explicitly so the column is int64 regardless of pandas' type inference.
df[label_column] = encoded_labels.astype("int64")

In [13]:
# Peek at the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,text,labels
0,team merimera alikuwa takataka,0
1,sijafurahishwa,0
2,kubuni dosari,0
3,bila kusema nilipoteza pesa zangu,0
4,sema kupoteza pesa na wakati,0


In [14]:
# Peek at the last five rows of the cleaned frame
df.tail(n=5)

Unnamed: 0,text,labels
3920,nafikiri chakula chapasa kuwa na ladha na umbi...,0
3921,hamu ya kula ilitoweka mara moja,0
3922,kwa ujumla sikuvutiwa na nisirudi nyuma,0
3923,mambo yote yaliyoonwa yalikuwa chini ya kiwang...,0
3924,basi ni kana kwamba nilipoteza maisha yangu ya...,0


In [15]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3925 entries, 0 to 3924
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3925 non-null   object
 1   labels  3925 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 61.5+ KB


In [16]:
# Count how many rows carry each label value
label_counts = df[label_column].value_counts()
label_counts

1    2002
0    1923
Name: labels, dtype: int64

In [17]:
# Summary statistics of the numeric columns, one row per column
summary_stats = df.describe()
summary_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
labels,3925.0,0.510064,0.499962,0.0,0.0,1.0,1.0,1.0


In [18]:
# Distinct values present in the label column
df["labels"].unique()

array([0, 1], dtype=int64)

In [None]:
# Persist the cleaned data. index=False keeps the RangeIndex out of the file;
# otherwise it is written as an unnamed first column that shows up as
# "Unnamed: 0" when the CSV is read back.
# NOTE(review): any downstream code that drops the first column of this file
# by position must be updated, since the file now holds only `text` and `labels`.
df.to_csv("clean_sentiment.csv", index=False)