In [1]:
import pandas as pd

sampl_path = r"\quora-insincere-questions-classification\sample_submission.csv"
test_path = r"quora-insincere-questions-classification\test.csv"
train_path = r"quora-insincere-questions-classification\train.csv"

test_df = pd.read_csv(test_path)
train_df = pd.read_csv(train_path)
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [22]:
test_df = test_df[:100]

test_df.shape

(50000, 2)

In [2]:
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')


In [3]:
english_stop_words = stopwords.words('english')
stemmer = SnowballStemmer(language='english')


In [4]:
train_df.duplicated().sum()

0

In [23]:
df = train_df[['question_text', 'target']]
df = df.sample(100000,random_state=42)

In [6]:
df.head()

Unnamed: 0,question_text,target
443046,What is the most effective classroom managemen...,0
947549,Can I study abroad after 10th class from Bangl...,0
523769,How can I make friends as a college junior?,0
949821,How do I download free APK Minecraft: Pocket E...,0
1030397,"Like Kuvera, is ""Groww"" also a free online inv...",0


In [7]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    stemmer = SnowballStemmer("english")
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in stemmed_tokens if token.lower() not in stop_words]
    processed_text = " ".join(filtered_tokens)
    
    return processed_text


In [8]:
process_df = [preprocess_text(i) for i in df.question_text]
process_df

['effect classroom manag skill/techniqu creat good learn environ ?',
 'studi abroad 10th class bangladesh ?',
 'make friend colleg junior ?',
 'download free apk minecraft : pocket edit io ( iphon ) ?',
 "like kuvera , `` groww '' also free onlin invest platform invest direct mutual fund ?",
 'precis definit record academ read term ?',
 "leonida 's role battl thermopyla ?",
 'get univers germani toefl ielt ?',
 "'s remaind 30^33/3 ?",
 'mindset learn new skill anyth ?',
 "old english `` pure english '' ?",
 'militari capit punish ?',
 'way earn money use brand new car except attach uber/ola ?',
 'whi ravindra jadeja alway prefer player india play 11 ?',
 'aldo leopold ?',
 'need purchas insur educ technolog compani ?',
 'come india emerg economi ?',
 'pursu bba mba ?',
 'entertain activ place cairo ?',
 "citizen rose ' docuseri featur rose mcgowan , air intern ?",
 'colleg applic , combin cross countri , indoor track , outdoor track one activ ?',
 'changes/addit harri potter movi actua

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=10000)

In [10]:
vectorizer.fit(process_df)

In [11]:
vectorizer.get_feature_names_out()[:10]

array(['00', '000', '01', '03', '04', '05', '06', '10', '100', '1000'],
      dtype=object)

In [12]:
vectorizer.vocabulary_

{'effect': 3027,
 'classroom': 1882,
 'manag': 5513,
 'skill': 8208,
 'techniqu': 8867,
 'creat': 2271,
 'good': 3868,
 'learn': 5194,
 'environ': 3174,
 'studi': 8607,
 'abroad': 343,
 '10th': 19,
 'class': 1876,
 'bangladesh': 1042,
 'make': 5494,
 'friend': 3685,
 'colleg': 1979,
 'junior': 4968,
 'download': 2858,
 'free': 3664,
 'apk': 716,
 'minecraft': 5809,
 'pocket': 6856,
 'edit': 3016,
 'io': 4761,
 'iphon': 4773,
 'like': 5285,
 'also': 574,
 'onlin': 6363,
 'invest': 4752,
 'platform': 6822,
 'direct': 2687,
 'mutual': 6009,
 'fund': 3708,
 'precis': 6980,
 'definit': 2512,
 'record': 7408,
 'academ': 358,
 'read': 7363,
 'term': 8908,
 'role': 7676,
 'battl': 1090,
 'get': 3808,
 'univers': 9365,
 'germani': 3805,
 'toefl': 9052,
 'ielt': 4464,
 'remaind': 7492,
 '30': 166,
 '33': 174,
 'mindset': 5807,
 'new': 6136,
 'anyth': 705,
 'old': 6343,
 'english': 3145,
 'pure': 7203,
 'militari': 5791,
 'capit': 1602,
 'punish': 7193,
 'way': 9716,
 'earn': 2976,
 'money': 5897

In [13]:
len(vectorizer.vocabulary_)

10000

In [16]:
x = vectorizer.transform(process_df)

In [18]:
count_matrix = x.toarray()
feature_names = vectorizer.get_feature_names_out()


## Now let's trasnform test data into vector

In [24]:
process_test = [preprocess_text(i) for i in test_df.question_text]
process_test

['speed electron observ ?',
 'donald trump meet leader north korea , refer littl rocket man ?',
 'get full fund scholarship educ ani european countri ?',
 '80 apex frozen food share . hold sell ?',
 'differ null valu zero dbms ?',
 'kylo ren poor popularis nietzsch ?',
 'best certif sql medic code ?',
 'would win fight , spongebob steven univers ?',
 'best way clean fmj model 45/410 derring ?',
 'crazi talk ?',
 'program languag algorithm differ without know program languag ani one learn algorithm ?',
 'resid uae iphon purchas uae u use facetim ?',
 'relat period sex women ?',
 'start convers strang girl ?',
 'ghost listen human convers speak direct human ?',
 'way go deaf without hurt ?',
 'deal unansw question ?',
 'past fix chang , futur fluid , doe mean futur arriv make happen ; wait us chang ?',
 'prepar hay bale cake ?',
 'whi like code ?',
 'much doe cost nba team belong leagu ?',
 'u. . naval academi establish ring danc program ?',
 'sale good state composit ?',
 'better : stud

In [25]:
trans_test = vectorizer.transform(process_test)
trans_test.toarray().shape

(50000, 10000)

In [26]:
from sklearn.model_selection import train_test_split
train_inputs, val_inputs, train_targat, val_target =train_test_split(x, df.target,
                                                                     test_size = 0.3, random_state=55)

In [27]:
print('Training inputs shape:', train_inputs.shape)
print('Validation inputs shape:', val_inputs.shape)
print('Training target shape:', train_targat.shape)
print('Validation target shape:', val_target.shape)

Training inputs shape: (70000, 10000)
Validation inputs shape: (30000, 10000)
Training target shape: (70000,)
Validation target shape: (30000,)


In [58]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=10000, solver='sag')
lr.fit(train_inputs, train_targat)

In [59]:
train_pred = lr.predict(train_inputs)
train_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [60]:
pd.Series(train_pred).value_counts()

0    67405
1     2595
Name: count, dtype: int64

In [61]:
from sklearn.metrics import accuracy_score
accuracy_score(train_targat, train_pred)

0.9631142857142857

In [62]:
import numpy as np
accuracy_score(train_targat,np.zeros(len(train_targat)))

0.9391

## now let's check with the test data set

In [63]:
val_pred = lr.predict(val_inputs)
val_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [64]:
pd.Series(val_pred).value_counts()

0    28928
1     1072
Name: count, dtype: int64

In [65]:
from sklearn.metrics import accuracy_score
accuracy_score(val_target, val_pred)

0.9514333333333334

In [66]:
from sklearn.metrics import f1_score
f1_score(train_targat,train_pred)

0.6235053951589384

In [67]:
f1_score(val_target,val_pred)

0.48823322795925533

In [88]:
sincere_q1 = df[df.target==0].question_text[2:5]
insincere_q1 = df[df.target==1].question_text[2:5]

In [89]:
sincere_q1

523769           How can I make friends as a college junior?
949821     How do I download free APK Minecraft: Pocket E...
1030397    Like Kuvera, is "Groww" also a free online inv...
Name: question_text, dtype: object

In [90]:
insincere_q1

423255    This is a question for Muslims? Because Americ...
30882     Could the numerous acts of arms being severed ...
908188    Should plus-size women wear dresses and skirts...
Name: question_text, dtype: object

In [91]:
insincere_q1_process = [preprocess_text(i) for i in insincere_q1]
insincere_q1_process

['question muslim ? becaus american women dress veri open , deserv get rape happen . 3encourag .i tri sick funni . want honest answer ?',
 'could numer act arm sever star war franchis metaphor scientif procedur castrat ?',
 'plus-siz women wear dress skirt onli ?']

In [92]:
sincere_q1_process = [preprocess_text(i) for i in sincere_q1]
sincere_q1_process

['make friend colleg junior ?',
 'download free apk minecraft : pocket edit io ( iphon ) ?',
 "like kuvera , `` groww '' also free onlin invest platform invest direct mutual fund ?"]

In [93]:
sin_vec = vectorizer.transform(sincere_q1_process)
insin_vec = vectorizer.transform(insincere_q1_process)

In [94]:
sin_vec.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [95]:
insin_vec.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [96]:
lr.predict(sin_vec)

array([0, 0, 0], dtype=int64)

In [97]:
lr.predict(insin_vec)

array([1, 1, 0], dtype=int64)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    tokens = word_tokenize(text)
    stemmer = SnowballStemmer("english")
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in stemmed_tokens if token.lower() not in stop_words]
    processed_text = " ".join(filtered_tokens)
    
    return processed_text

def apply_countvectorizer(dataset):
    preprocessed_documents = [preprocess_text(document) for document in dataset]
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(preprocessed_documents)
    count_matrix = X.toarray()
    feature_names = vectorizer.get_feature_names_out()
    df = pd.DataFrame(count_matrix, columns=feature_names)
    
    return df

documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

result_df = apply_countvectorizer(documents)
print(result_df)


In [27]:
documents = [
    "This is the first document. and i love to read this document, this is the excellent document of my life.",
    "This document is my second document.and i also love to read this document, this document is excellent too of my life.",
    "And this is the third one documents.",
    "okeay,..Is this the first document?",
]

In [45]:
process = [preprocess_text(i) for i in documents]
process

['first document . love read document , excel document life .',
 'document second document.and also love read document , document excel life .',
 'third one document .',
 'okeay , .. first document ?']

In [48]:
doc_vec = CountVectorizer(max_features=4)

In [51]:
doc_vec.fit(process)
doc_vec.get_feature_names_out()

array(['document', 'excel', 'first', 'life'], dtype=object)

In [52]:
doc_vec.vocabulary_


{'first': 2, 'document': 0, 'excel': 1, 'life': 3}

In [53]:
inputs = doc_vec.transform(process)

In [54]:
inputs.shape

(4, 4)

In [55]:
inputs.toarray().shape

(4, 4)

In [56]:
count_matrix = inputs.toarray()

# Get the feature names (words in the vocabulary)
feature_names = doc_vec.get_feature_names_out()

# Create a DataFrame for better visualization
df = pd.DataFrame(count_matrix, columns=feature_names)
df

Unnamed: 0,document,excel,first,life
0,3,1,1,1
1,4,1,0,1
2,1,0,0,0
3,1,0,1,0
