In [2]:
#importing necessary libraries
import re 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

In [3]:
#load dataset
covid19_dataset=pd.read_csv("Train.csv")
covid19_dataset

Unnamed: 0,ID,text,target
0,train_0,The bitcoin halving is cancelled due to,1
1,train_1,MercyOfAllah In good times wrapped in its gran...,0
2,train_2,266 Days No Digital India No Murder of e learn...,1
3,train_3,India is likely to run out of the remaining RN...,1
4,train_4,In these tough times the best way to grow is t...,0
...,...,...,...
5282,train_6856,The spread of the novel among asylum seekers o...,1
5283,train_6857,Hundreds of Jewish patients are being treated...,1
5284,train_6858,Beats me Honestly most of the people I follow ...,0
5285,train_6859,Help us reach more people by donating and shar...,0


In [4]:
#shape of the dataset
covid19_dataset.shape

(5287, 3)

In [5]:
#check for missing values
covid19_dataset.isna().sum()

ID        0
text      0
target    0
dtype: int64

In [6]:
#check for duplicated values
covid19_dataset.duplicated().sum()

0

In [7]:
#check value_counts of each label values
covid19_dataset['target'].value_counts()

target
0    2746
1    2541
Name: count, dtype: int64

In [8]:
#text column
text_column='text'

In [9]:
#Extract the text data
text_data=covid19_dataset['text'].values.tolist()

In [10]:
text_data

['The bitcoin halving is cancelled due to',
 'MercyOfAllah In good times wrapped in its granular detail I challenge myself to find meaning and model the humility t',
 '266 Days No Digital India No Murder of e learning No 2g online business No Restore in J amp k',
 'India is likely to run out of the remaining RNA kits which are essential for testing in one week What is the gov',
 'In these tough times the best way to grow is to learn or in my case teach to help people learn to connect Sports and Anal',
 'FIFA has proposed allowing teams to make up to five substitutions per match to help players cope with the return to action',
 'Lovers of sports especially do you know why sometimes the time changes All this is done in the name of Daylight Saving Time DST which is the practice of setting the clocks forward one hour from standard time during the summer months and back again in the fall',
 'ig he kinda cute sometimes smh',
 'Frontline health workers are critical in the fight against infect

In [11]:
#Check language of text
from langdetect import detect

language_ids=[]

for text in text_data:
    try:
        lang_id=detect(text)
        language_ids.append(lang_id)
    except:
        language_ids.append('Unknown')
        
covid19_dataset['language_id']=language_ids

In [12]:
print(covid19_dataset['language_id'].value_counts())

language_id
en    5205
id      23
de      15
so       7
ca       4
tl       4
cy       4
no       3
it       3
fr       2
hr       2
da       2
pt       2
af       2
et       2
sv       2
es       1
sk       1
pl       1
sq       1
tr       1
Name: count, dtype: int64


In [13]:
covid19_dataset[covid19_dataset['language_id']  == 'so']

Unnamed: 0,ID,text,target,language_id
1150,train_1274,Ramadan Daily Dua Day 4,0,so
2581,train_3091,day 3 of ramadan guys SABAAH AL KHAIRRRRRRRRRRRR,0,so
3277,train_4020,Q how do you say the king had in yoruba Me Oba...,1,so
3918,train_4887,Day 4 Ramadan New Normal,0,so
4776,train_6110,Also Ramadan Mubarak to you too,0,so
4869,train_6247,If I talk day go say I talk and nah Ramadan,0,so
5020,train_6471,Ramadan Mubarak to all Stay home stay safe,0,so


In [14]:
#Word tokenization:separate words by whitespace
def split_text_by_whitespace(text_data):
    text_data_split=[]
    for text in text_data:
        words=text.split()#split text by whitespaces
        text_data_split.append(words)
    return text_data_split
        

In [18]:
text_data_split=split_text_by_whitespace(text_data)
covid19_dataset['text_data_split']=text_data_split

In [19]:
text_data

['The bitcoin halving is cancelled due to',
 'MercyOfAllah In good times wrapped in its granular detail I challenge myself to find meaning and model the humility t',
 '266 Days No Digital India No Murder of e learning No 2g online business No Restore in J amp k',
 'India is likely to run out of the remaining RNA kits which are essential for testing in one week What is the gov',
 'In these tough times the best way to grow is to learn or in my case teach to help people learn to connect Sports and Anal',
 'FIFA has proposed allowing teams to make up to five substitutions per match to help players cope with the return to action',
 'Lovers of sports especially do you know why sometimes the time changes All this is done in the name of Daylight Saving Time DST which is the practice of setting the clocks forward one hour from standard time during the summer months and back again in the fall',
 'ig he kinda cute sometimes smh',
 'Frontline health workers are critical in the fight against infect

In [20]:
#Implementing word tokeniation using BPE
import os
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE
import tempfile

#creating a temporary file and writes  each text_data entry into the temporary file 
with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as f:
    for entry in text_data:
        f.write(entry + '\n')
        tmp_text_filename=f.name
        
if os.path.exists(tmp_text_filename):
    print("Temporary file created successfully:", tmp_text_filename)
else:
    print("Error:")



Temporary file created successfully: C:\Users\hp\AppData\Local\Temp\tmpb9pquv94


In [21]:
learn_bpe(open(tmp_text_filename, 'r', encoding='utf-8'), open('bpe_model','w', encoding='utf-8'), num_symbols=10000)

#load BPE model
bpe=BPE(open('bpe_model','r',encoding='utf-8'))

#Tokenize text using BPE
tokenized_text=[]
with open(tmp_text_filename, 'r', encoding='utf-8') as f:
    for line_number, line  in enumerate(f, start=1):
        try:
            tokenized_line=bpe.process_line(line.strip())
            tokenized_text.append(tokenized_line) 
        except Exception as e:
            print(f"Error processing line {line_number}':{e}")     

#

100%|██████████| 10000/10000 [01:08<00:00, 146.96it/s]


In [22]:
print(f"Number of tokenized lines processed:{len(tokenized_text)}")

Number of tokenized lines processed:5287


In [23]:
covid19_dataset['tokenized_text']=tokenized_text

In [24]:
del covid19_dataset['text']

In [25]:
covid19_dataset

Unnamed: 0,ID,target,language_id,text_data_split,tokenized_text
0,train_0,1,en,"[The, bitcoin, halving, is, cancelled, due, to]",The bit@@ coin hal@@ ving is cancelled due to
1,train_1,0,en,"[MercyOfAllah, In, good, times, wrapped, in, i...",MercyOfAllah In good times wrapped in its gran...
2,train_2,1,en,"[266, Days, No, Digital, India, No, Murder, of...",266 Days No Digital India No Murder of e learn...
3,train_3,1,en,"[India, is, likely, to, run, out, of, the, rem...",India is likely to run out of the remaining R@...
4,train_4,0,en,"[In, these, tough, times, the, best, way, to, ...",In these tough times the best way to grow is t...
...,...,...,...,...,...
5282,train_6856,1,en,"[The, spread, of, the, novel, among, asylum, s...",The spread of the novel among asy@@ lum see@@ ...
5283,train_6857,1,en,"[Hundreds, of, Jewish, patients, are, being, t...",Hun@@ dreds of Jewish patients are being treat...
5284,train_6858,0,en,"[Beats, me, Honestly, most, of, the, people, I...",Be@@ ats me Hon@@ estly most of the people I f...
5285,train_6859,0,en,"[Help, us, reach, more, people, by, donating, ...",Help us reach more people by donating and shar...


In [26]:
#Implementing N-gram models for classification
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

#Feature Extraction
ngram_ranges=[(1,1),(1,2),(1,3)]
#Experiment with different N-gram models
for ngram_range in ngram_ranges:
    vectorzer=CountVectorizer(ngram_range=ngram_range)
    X=vectorzer.fit_transform(covid19_dataset['tokenized_text'])
    y=covid19_dataset['target']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
    classifier=MultinomialNB()
    classifier.fit(X_train,y_train)
    
    #Model Evaluation
    y_pred=classifier.predict(X_test)
    accuracy=accuracy_score(y_test, y_pred)
    print(f"N-gram range:{ngram_range}, Accuracy:{accuracy:.2f}")
#

N-gram range:(1, 1), Accuracy:0.87
N-gram range:(1, 2), Accuracy:0.87
N-gram range:(1, 3), Accuracy:0.87


In [28]:
#Evaluate on the test_set
Test_database=pd.read_csv('Test.csv')
Test_database

Unnamed: 0,ID,text
0,test_2,Why is explained in the video take a look
1,test_3,Ed Davey fasting for Ramadan No contest
2,test_4,Is Doja Cat good or do you just miss Nicki Minaj
3,test_8,How Boris Johnson s cheery wounded in action p...
4,test_9,Man it s terrible Not even a reason to get on ...
...,...,...
1957,test_2932,Fageeru meehaa geyga Bandah PUBLIC fundS amp G...
1958,test_2934,DFFN Diffusion Pharmaceuticals Announces Pre I...
1959,test_2936,I want to wish the Muslim members of Congress ...
1960,test_2937,You mean you don t believe there is a conspira...


In [30]:
test_data=Test_database['text'].to_list()

In [43]:
#creating a temporary file and writes  each text_data entry into the temporary file 
with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as f:
    for entry in test_data:
        f.write(entry + '\n')
        test_text_filename=f.name
        
if os.path.exists(test_text_filename):
    print("Temporary file created successfully:", test_text_filename)
else:
    print("Error:")


Temporary file created successfully: C:\Users\hp\AppData\Local\Temp\tmp_nv3mtk5


In [44]:
#Tokenize text using BPE
tokenized_text=[]
with open(test_text_filename, 'r', encoding='utf-8') as f:
    for line_number, line  in enumerate(f, start=1):
        try:
            tokenized_line=bpe.process_line(line.strip())
            tokenized_text.append(tokenized_line) 
        except Exception as e:
            print(f"Error processing line {line_number}':{e}")     

#

In [45]:
Test_database['tokenized_text']=tokenized_text

In [46]:
X=vectorzer.transform(Test_database['tokenized_text'])
pred=classifier.predict(X)

In [47]:
Test_database['target']=pred
del Test_database['text']
del Test_database['tokenized_text']

KeyError: 'text'

In [40]:
Test_database

Unnamed: 0,ID,target
0,test_2,1
1,test_3,0
2,test_4,0
3,test_8,1
4,test_9,0
...,...,...
1957,test_2932,1
1958,test_2934,1
1959,test_2936,0
1960,test_2937,1


In [41]:
Test_database.to_csv('Submission.csv')

In [None]:
#Naives Bayes, Text Classification and Sentiment
