In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[K     |████████████████████████████████| 297.1 MB 30 kB/s s eta 0:00:01    |████████▎                       | 76.8 MB 14.3 MB/s eta 0:00:16     |██████████████████████▉         | 212.4 MB 7.6 MB/s eta 0:00:12
Collecting numpy
  Downloading numpy-1.22.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[K     |████████████████████████████████| 16.8 MB 16.3 MB/s eta 0:00:01
[?25hInstalling collected packages: numpy, xgboost
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.5.0 requires daal==2021.4.0, which is not installed.
tensorflow 2.15.0.post1 requires numpy<2.0.0,>=1.23.5, but you h

In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, confusion_matrix



ModuleNotFoundError: No module named 'xgboost'

In [None]:
import warnings
# Silence every warning for the rest of the session: no category or module filter
# is given, so warnings raised by any later cell are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
# Load the labelled training set from the Kaggle input directory, the same
# location the test set is read from further down in this notebook.
df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.head()

In [None]:
# Drop the columns the rest of the notebook does not use. Reassigning instead of
# inplace=True, together with errors='ignore', lets this cell be re-run safely
# (a second run no longer raises KeyError for already-removed columns).
df = df.drop(columns=['id', 'keyword', 'location'], errors='ignore')

## Data Processing

In [None]:
df.head()

In [None]:
# Compiled once: these patterns are applied to every row of the text column.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_TEXT_CLEANING_CACHE = {}


def _text_cleaning_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    if not _TEXT_CLEANING_CACHE:
        # A frozenset gives constant-time membership tests; the list returned by
        # stopwords.words() would be scanned linearly for every token.
        _TEXT_CLEANING_CACHE['stopwords'] = frozenset(stopwords.words('english'))
        _TEXT_CLEANING_CACHE['stemmer'] = SnowballStemmer('english')
    return _TEXT_CLEANING_CACHE['stopwords'], _TEXT_CLEANING_CACHE['stemmer']


def clean_text(text):
    """Normalise a text string into space-separated word stems.

    Steps: lower-case the text, remove every character that is not an ASCII
    letter or whitespace, drop English stop words, then Snowball-stem the
    remaining tokens.

    Args:
        text: The string to clean.

    Returns:
        The stemmed tokens joined by single spaces ('' when no token survives).
    """
    stop_words, stemmer = _text_cleaning_resources()

    letters_only = _NON_LETTER_RE.sub('', text.lower())

    # str.split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so no separate collapse/strip pass is needed.
    # NOTE(review): stop words that contain an apostrophe (e.g. "don't") can never
    # match here because apostrophes are stripped first -- confirm whether
    # contractions should be filtered too.
    kept_tokens = (word for word in letters_only.split() if word not in stop_words)

    return ' '.join(stemmer.stem(word) for word in kept_tokens)


In [None]:
# Replace the raw text with its cleaned form, element by element. The column is
# overwritten, so re-running this cell cleans the already-cleaned text again.
df['text'] = df['text'].map(clean_text)

In [None]:
# Character count of each cleaned text, via the vectorised string accessor.
df['text_length'] = df['text'].str.len()

## EDA

In [None]:
df.sample()

In [None]:
target = df['target'].value_counts()

In [None]:
sns.barplot(x=target.index,y=target.values)

In [None]:
sns.boxenplot(data=df,x='target',y='text_length')    

In [None]:
px.histogram(data_frame=df, x='text_length',nbins=50,color='target', width=800, height=400)

In [None]:
avg_len = df.groupby('target')['text_length'].mean()

In [None]:
px.bar(x=avg_len.index,y= avg_len.values, width=500, height=400,color=avg_len.index)

In [None]:
# Draw one word cloud per target class. The grid is sized from the number of
# classes so the figure is not left mostly empty when there are only a few.
class_labels = sorted(df['target'].unique())
n_cols = 2
n_rows = -(-len(class_labels) // n_cols)  # ceiling division without importing math
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, label in zip(axes, class_labels):
    # Concatenate every cleaned text of this class into one document for the cloud.
    class_text = ' '.join(df.loc[df['target'] == label, 'text'])
    wordcloud = WordCloud(height=400, width=800, background_color='white').generate(class_text)
    ax.imshow(wordcloud)
    ax.set_title(f'word cloud for {label}')
    ax.axis('off')  # pixel ticks carry no meaning for an image

# Hide grid cells left over when the class count does not fill the last row.
for ax in axes[len(class_labels):]:
    ax.set_visible(False)

# Lay out once, after all panels exist, rather than on every loop iteration.
fig.tight_layout()
plt.show()

In [None]:
df.head()

## Vectorize

In [None]:
X = df['text']
y = df['target']

In [None]:
tfid_vect = TfidfVectorizer(max_features=5000)

In [None]:
# Learn the TF-IDF vocabulary and weights from X and convert it to a sparse matrix.
# NOTE(review): the vectorizer is fitted on every row of X before the
# train/validation split further down, so vocabulary and IDF weights are learned
# from validation rows too; validation scores may therefore be slightly optimistic.
X_vectorize = tfid_vect.fit_transform(X)

In [None]:
tfid_vect.get_feature_names_out()

## Create Train and Validation Data

In [None]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorize,y,test_size=0.2,random_state=42)

In [None]:
X_train.shape

In [None]:
X_val.shape

## Training Model

In [None]:
# Candidate models to compare on the validation split, keyed by display name.
# The randomised ensembles are seeded (same seed as the train/validation split)
# so their scores are reproducible from run to run.
classifier = {
    'Logistic Regression': LogisticRegression(),
    'SVC': SVC(),
    'BernoulliNB': BernoulliNB(),
    'MultinomialNB': MultinomialNB(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42),
}

In [None]:
# Fit every candidate on the training split, then report its validation F1 score
# and confusion matrix (as text and as a compact heatmap).
for name, model in classifier.items():
    model.fit(X_train, y_train)
    y_predict = model.predict(X_val)

    f1score = f1_score(y_val, y_predict)
    confusion_mat = confusion_matrix(y_val, y_predict)
    print('classifier : ', name)
    print('f1-score   : ', f1score)
    print('confusion matrix : \n', confusion_mat)

    # Small annotated heatmap of the confusion matrix; axes are hidden to save space.
    fig = px.imshow(confusion_mat, text_auto=True)
    fig.update_layout(height=300, width=400, xaxis_visible=False, yaxis_visible=False)
    fig.show()
    print("=" * 50)

#### From the above, the BernoulliNB model has the best F1 score.

In [None]:
bernoulli = BernoulliNB()

In [None]:
bernoulli.fit(X_train,y_train)

In [None]:
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Clean the test text exactly as the training text was cleaned before vectorizing:
# the vectorizer's vocabulary was built from clean_text output (stemmed, stop words
# removed), so raw text would produce features that do not line up with it.
X_test = tfid_vect.transform(test_df['text'].apply(clean_text))

In [None]:
test_df['target'] = bernoulli.predict(X_test)

In [None]:
test_df[['id','target']].to_csv('submission.csv',index=False)