In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
from collections import Counter

In [3]:
def read_docu(file):

    all_words = []

    with open(file, "r", encoding = "utf-8") as input_file:
        for line in input_file:
            line = line.lower()
            line = line.strip().split()
            all_words += line
        return(all_words)

def word_counter(all_words):

    word_count = Counter()
    for word in all_words:
        word_count[word] += 1
    return(word_count.values())

In [4]:
df= pd.read_csv('train_snli.txt', sep='\t')
df.head()

Unnamed: 0,A person on a horse jumps over a broken down airplane.,"A person is at a diner, ordering an omelette.",0
0,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1.0
1,Children smiling and waving at camera,There are children present,1.0
2,Children smiling and waving at camera,The kids are frowning,0.0
3,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0.0
4,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,1.0


In [5]:
df.isnull().sum()

A person on a horse jumps over a broken down airplane.    0
A person is at a diner, ordering an omelette.             0
0                                                         1
dtype: int64

In [6]:
#text
text = "Two blond women are hugging one another"

# length of text ( includes spaces)
print("length of text: ",len(text))

# split the text
splitted_text = text.split() # default split methods splits text according to spaces
print("Splitted text: ",splitted_text)    # splitted_text is a list that includes words of text sentence
# each word is called token in text maning world.

length of text:  39
Splitted text:  ['Two', 'blond', 'women', 'are', 'hugging', 'one', 'another']


In [7]:
# find specific words with list comprehension method
specific_words = [word for word in splitted_text if(len(word)>2)]
print("Words which are more than 3 letter: ",specific_words)

# capitalized words with istitle() method that finds capitalized words
capital_words = [ word for word in splitted_text if word.istitle()]
print("Capitalized words: ",capital_words)

# words which end with "o": endswith() method finds last letter of word
words_end_with_o =  [word for word in splitted_text if word.endswith("o")]
print("words end with o: ",words_end_with_o)

# words which starts with "w": startswith() method
words_start_with_w = [word for word in splitted_text if word.startswith("w")]
print("words start with w: ",words_start_with_w)

Words which are more than 3 letter:  ['Two', 'blond', 'women', 'are', 'hugging', 'one', 'another']
Capitalized words:  ['Two']
words end with o:  ['Two']
words start with w:  ['women']


In [8]:
# unique with set() method
print("unique words: ",set(splitted_text))  # actually the word "no" is occured twice bc one word is "no" and others "No" there is a capital letter at first letter

# make all letters lowercase with lower() method
lowercase_text = [word.lower() for word in splitted_text]

# then find uniques again with set() method
print("unique words: ",set(lowercase_text))

unique words:  {'hugging', 'one', 'Two', 'another', 'blond', 'are', 'women'}
unique words:  {'hugging', 'one', 'another', 'blond', 'two', 'are', 'women'}


In [9]:
# check words includes or not includes particular substring or letter
print("Is w letter in women word:", "w" in "women")

# check words are upper case or lower case
print("Is word uppercase:", "WOMEN".isupper())
print("Is word lowercase:", "hugging".islower())

# check words are made of by digits or not
print("Is word made of by digits: ","12345".isdigit())

# get rid of from white space characters like spaces and tabs or from unwanted letters with strip() method
print("00000000Two blond: ","00000000Two blond".strip("0"))

# find particular letter from front
print("Find particular letter from back: ","one another".find("r"))  # at index 1

# find particular letter from back  rfind = reverse find
print("Find particular letter from back: ","one another".rfind("r"))  # at index 8

# replace letter with number
print("Replace o with 4 ", "one another".replace("r","4"))

# find each letter and store them in list
print("Each letter: ",list("Two blond"))

Is w letter in women word: True
Is word uppercase: True
Is word lowercase: True
Is word made of by digits:  True
00000000Two blond:  Two blond
Find particular letter from back:  10
Find particular letter from back:  10
Replace o with 4  one anothe4
Each letter:  ['T', 'w', 'o', ' ', 'b', 'l', 'o', 'n', 'd']


In [10]:
# Cleaning text
text1 = "    The kids are frowning    "
print("Split text: ",text1.split(" "))   # as you can see there are unnecessary white space in list

# get rid of from these unnecassary white spaces with strip() method then split
print("Cleaned text: ",text1.strip().split(" "))

Split text:  ['', '', '', '', 'The', 'kids', 'are', 'frowning', '', '', '', '']
Cleaned text:  ['The', 'kids', 'are', 'frowning']


In [11]:
# reading files line by line
f = open("train_snli.txt","r")

# read first line
print(f.readline())

# length of text
text3=f.read()
print("Length of text: ",len(text3))

# Number of lines with splitlines() method
lines = text3.splitlines()
print("Number of lines: ",len(lines))

A person on a horse jumps over a broken down airplane.	A person is at a diner, ordering an omelette.	0

Length of text:  2097049
Number of lines:  19813


In [12]:
df = df.rename(columns={'A person on a horse jumps over a broken down airplane.':'person', 'A person is at a diner, ordering an omelette.': 'omelette'})

In [13]:
df.head()

Unnamed: 0,person,omelette,0
0,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1.0
1,Children smiling and waving at camera,There are children present,1.0
2,Children smiling and waving at camera,The kids are frowning,0.0
3,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0.0
4,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,1.0


In [14]:
print("In this text, the rate of occuring boy word is: ",sum(df.person.str.contains('boy'))/len(df))
# text
text = df.person[1]
print(text)

In this text, the rate of occuring boy word is:  0.06663945300540872
Children smiling and waving at camera


In [15]:
# find regular expression on text
# import regular expression package
import re
# find callouts that starts with @
callouts = [word for word in text.split(" ") if re.search("@[A-Za-z0-9_]+",word)]
print("callouts: ",callouts)

callouts:  []


In [16]:
# find specific characters like "w"
print(re.findall(r"[w]",text))
# "w"ith, "w"indo"w", sho"w"ing, s"w"itches

# do not find specific character like "w". We will use "^" symbol
print(re.findall(r"[^w]",text))

['w']
['C', 'h', 'i', 'l', 'd', 'r', 'e', 'n', ' ', 's', 'm', 'i', 'l', 'i', 'n', 'g', ' ', 'a', 'n', 'd', ' ', 'a', 'v', 'i', 'n', 'g', ' ', 'a', 't', ' ', 'c', 'a', 'm', 'e', 'r', 'a']


In [17]:
# NLP ( Natural Language Processing)

In [18]:
# import natural language tool kit
import nltk as nlp

# counting vocabulary of words
text = df.person[1]
splitted = text.split(" ")
print("number of words: ",len(splitted))

# counting unique vocabulary of words
text = df.person[1]
print("number of unique words: ",len(set(splitted)))

# print first five unique words
print("first 5 unique words: ",list(set(splitted))[:5])

# frequency of words
dist = nlp.FreqDist(splitted)
print("frequency of words: ",dist)

# look at keys in dist
print("words in person: ",dist.keys())

# count how many time a particalar value occurs. Lets look at "box"
print("the word box is occured how many times:",dist["box"])

number of words:  6
number of unique words:  6
first 5 unique words:  ['Children', 'at', 'camera', 'and', 'waving']
frequency of words:  <FreqDist with 6 samples and 6 outcomes>
words in person:  dict_keys(['Children', 'smiling', 'and', 'waving', 'at', 'camera'])
the word box is occured how many times: 0


In [19]:
# Normalization and Stemming words

In [20]:
# normalization
words = "task Tasked tasks tasking"
words_list = words.lower().split(" ")
print("normalized words: ",words_list)

# stemming
porter_stemmer = nlp.PorterStemmer()
roots = [porter_stemmer.stem(each) for each in words_list]
print("roots of task Tasked tasks tasking: ",roots)

normalized words:  ['task', 'tasked', 'tasks', 'tasking']
roots of task Tasked tasks tasking:  ['task', 'task', 'task', 'task']


In [21]:
# Lemmatization

In [22]:
# stemming
import nltk
nltk.download('wordnet')

stemming_word_list = ["baseball","airplane","restaurant","drinking","outdoors"]
porter_stemmer = nlp.PorterStemmer()
roots = [porter_stemmer.stem(each) for each in stemming_word_list]
print("result of stemming: ",roots)

# lemmatization
lemma = nlp.WordNetLemmatizer()
lemma_roots = [lemma.lemmatize(each) for each in stemming_word_list]
print("result of lemmatization: ",lemma_roots)

[nltk_data] Downloading package wordnet to /root/nltk_data...


result of stemming:  ['basebal', 'airplan', 'restaur', 'drink', 'outdoor']
result of lemmatization:  ['baseball', 'airplane', 'restaurant', 'drinking', 'outdoors']


In [23]:
# Tokenization

In [24]:
import nltk
nltk.download('punkt')

text_t = "Two groups of rival gang members flipped each other off."
print("split the sentence: ", text_t.split(" "))  # 5 words

# tokenization with nltk
print("tokenize with nltk: ",nlp.word_tokenize(text_t))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


split the sentence:  ['Two', 'groups', 'of', 'rival', 'gang', 'members', 'flipped', 'each', 'other', 'off.']
tokenize with nltk:  ['Two', 'groups', 'of', 'rival', 'gang', 'members', 'flipped', 'each', 'other', 'off', '.']


In [25]:
# categorical features with missing values
categorical_nan = [feature for feature in df.columns if df[feature].isna().sum()>0 and df[feature].dtypes=='O']
print(categorical_nan)

[]


In [26]:
# replacing missing values in categorical features
for feature in categorical_nan:
    df[feature] = df[feature].fillna('None')

In [27]:
df[categorical_nan].isna().sum()

Series([], dtype: float64)

In [29]:
# creating bag of words model
from sklearn.feature_extraction.text import CountVectorizer  # for bag of words
max_features = 150 # max_features dimension reduction
count_vectorizer = CountVectorizer(stop_words="english", max_features=max_features)
review_list = df.iloc[:,1] #fixed by Billa comment @skbilla

# stop_words parameter = automatically remove all stopwords
# lowercase parameter
# token_pattern removing other karakters like .. !

sparce_matrix = count_vectorizer.fit_transform(review_list).toarray() # sparce matrix yaratir bag of words model = sparce matrix

print("Most used {} words: {}".format(max_features, count_vectorizer.get_feature_names_out()))


y = df.iloc[:,0].values  # positive or negative comment

#sparce matrix includes independent variable

Most used 150 words: ['adults' 'animal' 'asleep' 'baby' 'ball' 'band' 'baseball' 'basketball'
 'beach' 'bed' 'bench' 'bicycle' 'bike' 'black' 'blue' 'boat' 'book' 'boy'
 'boys' 'building' 'camera' 'car' 'cat' 'child' 'children' 'city'
 'clothes' 'construction' 'cooking' 'couch' 'couple' 'crowd' 'dancing'
 'dinner' 'dog' 'dogs' 'doing' 'dress' 'dressed' 'drinking' 'driving'
 'eating' 'female' 'field' 'floor' 'food' 'football' 'game' 'getting'
 'girl' 'girls' 'going' 'grass' 'green' 'ground' 'group' 'guitar' 'guy'
 'hair' 'hands' 'hat' 'having' 'holding' 'home' 'horse' 'house' 'indoors'
 'inside' 'jumping' 'jumps' 'kid' 'kids' 'kitchen' 'lady' 'large' 'laying'
 'little' 'looking' 'looks' 'making' 'man' 'men' 'motorcycle' 'mountain'
 'music' 'near' 'ocean' 'old' 'orange' 'outdoors' 'outside' 'painting'
 'park' 'people' 'performing' 'person' 'phone' 'picture' 'play' 'player'
 'players' 'playing' 'plays' 'pool' 'reading' 'red' 'restaurant' 'rides'
 'riding' 'road' 'room' 'running' 'shirt' '

In [30]:
# train test split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(sparce_matrix,y,test_size = 0.1,random_state = 0)

In [37]:
# naive bayes
from sklearn.naive_bayes import GaussianNB
import pickle

model_filename = 'naive_bayes_model.pkl'

if os.path.exists(model_filename):
    # Load the model from the file if it exists
    with open(model_filename, 'rb') as file:
        nb = pickle.load(file)
else:
    # Train the model if the file does not exist
    nb = GaussianNB()
    nb.fit(sparce_matrix, y)

    # Save the model to a file
    with open(model_filename, 'wb') as file:
        pickle.dump(nb, file)


In [39]:
plagiarism_file = 'train_snli.txt'
with open(plagiarism_file) as f: # The with keyword automatically closes the file when you are done
    print (f.read(3000))

A person on a horse jumps over a broken down airplane.	A person is at a diner, ordering an omelette.	0
A person on a horse jumps over a broken down airplane.	A person is outdoors, on a horse.	1
Children smiling and waving at camera	There are children present	1
Children smiling and waving at camera	The kids are frowning	0
A boy is jumping on skateboard in the middle of a red bridge.	The boy skates down the sidewalk.	0
A boy is jumping on skateboard in the middle of a red bridge.	The boy does a skateboarding trick.	1
An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background.	A boy flips a burger.	0
Two blond women are hugging one another.	The women are sleeping.	0
Two blond women are hugging one another.	There are women showing affection.	1
A few people in a restaurant setting, one of them is drinking orange juice.	The people are sitting at desks in school.	0
A few people in a restaurant setting, one of them