In [1]:
# import libraries
from src.preprocess.preprocess import *
from src.data_operations.rw_utils import read_from_csv, write_to_csv, read_from_excel
from src.data_operations.data_quality import create_data_quality_report
from src.data_operations.plots import plot_bar_chart, plot_wordcloud
import time
import numpy as np


# tweets data set operations

In [2]:
# read data from source
# Load the training split from the Excel file. The second argument names the
# columns of interest; the helper's log line below reports exactly these two.
train = read_from_excel("train_tweets.xlsx", ["tweet", "sentiment"])
# Preview the first rows as a sanity check.
train.head()

Data is read. Len of the data 13832 and columns Index(['tweet', 'sentiment'], dtype='object')


Unnamed: 0,tweet,sentiment
0,Ulan Wifi'ye bağlıyım ben. Ona bağlıyken Turkc...,olumsuz
1,20 dk 1 GB internet 500 mb sadece kaşar turkce...,olumsuz
2,Ayrıca turkcell superonline reklamı kadar da k...,olumsuz
3,Turkcell çok pahalı ya,olumsuz
4,Turkcell Kaş'ta internetin cekmiyor,olumsuz


In [3]:
# Load the test split the same way as the training split: same helper and the
# same two columns ("tweet" and "sentiment").
test = read_from_excel("test_tweets.xlsx", ["tweet", "sentiment"])
# Preview the first rows as a sanity check.
test.head()

Data is read. Len of the data 3457 and columns Index(['tweet', 'sentiment'], dtype='object')


Unnamed: 0,tweet,sentiment
0,Turkcell'e kızgınım. Ve bu kızgınlık sanırım a...,olumsuz
1,turkcell kadar şerefsiz misiniz ya,olumsuz
2,Burdan Turkcell'e sesleniyorum o 3 tl haram olsun,olumsuz
3,Hayatımda turkcell kadar kazık 1 operatör görm...,olumsuz
4,Turkcell gözümde son demlerini yaşıyor hattı d...,olumsuz


In [4]:
# get each label counts
# NOTE: despite the df_ prefix, value_counts() returns a Series whose index
# holds the sentiment labels and whose values are the number of training rows
# per label, sorted from most to least frequent.
df_label_counts = train["sentiment"].value_counts()
df_label_counts

olumsuz    5511
notr       4658
olumlu     3663
Name: sentiment, dtype: int64

In [5]:
# visualize the sample counts of each label
# value_counts() already indexes the counts by label, so read the labels and
# the counts straight from the Series instead of round-tripping through a
# temporary DataFrame.
labels = df_label_counts.index.tolist()
counts = df_label_counts.to_numpy()
plot_bar_chart(labels, counts, "Sentiments - Train").show()

In [6]:
## Data Information
# Run the project's data quality report on the training frame, with
# "sentiment" passed as the label column. The helper prints its findings
# (frame info, class list, summary statistics) and returns a frame that is
# displayed in the next cell.
missing_value_df = create_data_quality_report(train, "sentiment")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13832 entries, 0 to 13831
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      13832 non-null  object
 1   sentiment  13832 non-null  object
dtypes: object(2)
memory usage: 216.2+ KB
#################################################
############ DATA QUALITY RESULT ################
#################################################

Number of sample in data set:13832.
Number of classes in data set: 3 and they are: ['olumsuz' 'olumlu' 'notr'].
Columns in data set:['tweet', 'sentiment'].

None.



############## SUMMARY STATISTICS ###############

tweet sentiment
count                                               13832     13832
unique                                              13802         3
top     Ensar'a sponsor olan Turkcell'in yıllık kârı y...   olumsuz
freq                                                    2      5511.



############## NULL PERCENTAGES ##

In [7]:
# Display the frame returned by create_data_quality_report: one row per column
# with its percentage of missing values.
missing_value_df

Unnamed: 0,column_name,percent_missing
tweet,tweet,0.0
sentiment,sentiment,0.0


In [8]:
# drop null values and duplicate values
# Empty tweets are first turned into NaN so that dropna() removes them too.
# The replacement is assigned back through assign() rather than calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form emits a warning on pandas 2.x and does not modify `train` at all once
# Copy-on-Write is active, which would silently keep the empty tweets.
train = (
    train
    .assign(tweet=train["tweet"].replace("", np.nan))
    .dropna()
    .drop_duplicates()
)
# The index keeps its original labels (gaps where rows were removed).
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13807 entries, 0 to 13831
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      13807 non-null  object
 1   sentiment  13807 non-null  object
dtypes: object(2)
memory usage: 323.6+ KB


In [9]:
# Preprocessing pipeline: the functions are applied to each tweet one after
# another, in exactly this order, by the cells below.
preprocess_operations = [
    to_lower,
    remove_stopwords,
    handle_emojis,
    remove_hyperlink,
    remove_number,
    remove_punctuation,
    remove_whitespace,
    replace_special_chars,
    remove_less_than_two,
]

In [10]:
# example result
# Show the raw training tweet at position 12 (an arbitrary sample) so it can be
# compared with its preprocessed form in the next cell.
train["tweet"].values[12]

'Of evin her köşesinden Turkcell geçiyor bizi kazıklamalarıyla kalmayıp birde internetleri düşük hızlı'

In [11]:
# Run the whole pipeline on a one-element corpus holding the sample tweet from
# the previous cell; the helper returns a list with the cleaned text.
apply_preprocess_operations_to_corpus([train["tweet"].values[12]], preprocess_operations)

['evin köşesinden geçiyor bizi kazıklamalarıyla kalmayıp birde internetleri düşük hızlı']

In [12]:
# apply preprocess operation train
# Time the run with perf_counter(): it is a monotonic, high-resolution clock,
# so the elapsed time cannot be distorted by system clock adjustments the way
# a time.time() difference can.
start = time.perf_counter()
for operation in preprocess_operations:
    # Each pass rewrites the whole "tweet" column with the output of one
    # operation, so operations see the result of the previous ones.
    train["tweet"] = train["tweet"].apply(operation)
print(f"Processed {len(train)} samples.\n")
print(f"It's took {(time.perf_counter()-start) / 60} minutes.")

Processed 13807 samples.

It's took 0.018102538585662842 minutes.


In [13]:
# apply preprocess operation to the test set
# Same pipeline and order as for the training tweets. Time the run with
# perf_counter(): it is a monotonic, high-resolution clock, so the elapsed time
# cannot be distorted by system clock adjustments the way a time.time()
# difference can.
start = time.perf_counter()
for operation in preprocess_operations:
    # Each pass rewrites the whole "tweet" column with the output of one
    # operation, so operations see the result of the previous ones.
    test["tweet"] = test["tweet"].apply(operation)
print(f"Processed {len(test)} samples.\n")
print(f"It's took {(time.perf_counter()-start) / 60} minutes.")

Processed 3457 samples.

It's took 0.003445597489674886 minutes.


In [14]:
# Persist both preprocessed splits as CSV files, training split first.
for file_name, frame in (
    ("preprocess_train.csv", train),
    ("preprocess_test.csv", test),
):
    write_to_csv(file_name, frame)

Data is wrote to path C:\Users\user\Desktop\YL\1.2\hesaplamalı_anabilim\ödev2\20501001\20501001\sentiment_analysis/data/, with name preprocess_train.csv
Data is wrote to path C:\Users\user\Desktop\YL\1.2\hesaplamalı_anabilim\ödev2\20501001\20501001\sentiment_analysis/data/, with name preprocess_test.csv


In [15]:
# Inspect the preprocessed training tweets as a NumPy array; the repr is
# truncated, so only the first and last few entries are shown.
train["tweet"].values

array(['ulan wifiye bağlıyım ben ona bağlıyken internet paketin bitti mesaj atabilir bana onu ödeyelim',
       'internet sadece kaşar düşer çocukları',
       'ayrıca reklamı kadar kötü bir reklam görmemiştim', ...,
       'merhaba numarami bir baska operatöre taşıdım hattimda kalan bakiyeyinin iadesini nasil yapıyorsunuz',
       'iyi herkes abonesi değil',
       'çekmiyor çekmiyor kaç para ulan fakir telefonu avea parayı bulayım ilk işim geçmek'],
      dtype=object)

In [16]:
# Disabled: word cloud over all preprocessed training tweets. Uncomment to render it.
#plot_wordcloud(train["tweet"].values, title="Sentiment Analysis Word Cloud")

In [17]:
# Disabled: word cloud restricted to training tweets labelled "olumlu" (positive). Uncomment to render it.
#plot_wordcloud(train.where(train.sentiment == "olumlu").dropna()["tweet"].values, title="Sentiment Analysis Word Cloud Olumlu")

In [18]:
# Disabled: word cloud restricted to training tweets labelled "olumsuz" (negative). Uncomment to render it.
#plot_wordcloud(train.where(train.sentiment == "olumsuz").dropna()["tweet"].values, title="Sentiment Analysis Word Cloud Olumsuz")

In [19]:
# Disabled: word cloud restricted to training tweets labelled "notr" (neutral). Uncomment to render it.
#plot_wordcloud(train.where(train.sentiment == "notr").dropna()["tweet"].values, title="Sentiment Analysis Word Cloud Notr")