# Tweets

In [1]:
!pip install -r requirements.txt



## Imports

In [2]:
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
# from nltk.stem import WordNetLemmatizer

# from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import string

# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

# from spellchecker import SpellChecker
import text_cleninig
import text_processing
import machine_learning
import word2vec
import contractions

In [3]:
'''
In case of problems with SSL in nltk.download
https://github.com/gunthercox/ChatterBot/issues/930#issuecomment-322111087
'''
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/dbadeev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dbadeev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/dbadeev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/dbadeev/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Obtaining data

We have 3 datasets with data for positive, negative and neutral tweets stored in 3 csv files.
Let's create a dataframe of those data.

In [5]:
negative = pd.DataFrame(pd.read_csv('data/processedNegative.csv').T).reset_index()
negative = negative.rename(columns={'index': 'tweets'})
negative['type'] = -1

In [6]:
neutral = pd.DataFrame(pd.read_csv('data/processedNeutral.csv').T).reset_index()
neutral = neutral.rename(columns={'index': 'tweets'})
neutral['type'] = 0

In [7]:
positive = pd.DataFrame(pd.read_csv('data/processedPositive.csv').T).reset_index()
positive = positive.rename(columns={'index': 'tweets'})
positive['type'] = 1

In [8]:
frames = [positive, negative, neutral]
df = pd.concat(frames)
df.head()

Unnamed: 0,tweets,type
0,An inspiration in all aspects: Fashion,1
1,fitness,1
2,beauty and personality. :)KISSES TheFashionIcon,1
3,Apka Apna Awam Ka Channel Frankline Tv Aam Adm...,1
4,Beautiful album from the greatest unsung guit...,1


### Let's create a dataframe for our results (different preprocessing techinques and different vectorizing methods). Dataframe is filled with NaN at this moment

In [9]:
preprocessing = ['just tokenization', 'stemming', 'lemmatization', 'stemming + misspellings',
                                                                    'lemmatization + misspellings', 'any other ideas']
vectorizers = ['0 or 1, if the word exists', 'word counts', 'TFIDF']
df_df = pd.DataFrame(columns=vectorizers, index=preprocessing)
df_df

Unnamed: 0,"0 or 1, if the word exists",word counts,TFIDF
just tokenization,,,
stemming,,,
lemmatization,,,
stemming + misspellings,,,
lemmatization + misspellings,,,
any other ideas,,,


## Data Preparation

### Functions

##### Text cleaning funcs

- contractions to full form
- replace_emoticons with text
- remove ticks and next symbol
- remove url (http*)
- remove hashtags (#)
- remove mentions (@)
- remove numbers
- ignore case
- ignore punctuation
- remove stop words (optional)
- remove misspelling (optional)
- remove extra spaces



### Just Tokenization

Tokenization is a common task a data scientist comes across when working with text data. It consists of splitting an entire text into small units, also known as tokens. Most Natural Language Processing (NLP) projects have tokenization as the first step because it’s the foundation for developing good models and helps better understand the text we have.

In [10]:
df_token = df.copy(deep=True)

#### apply clean function

In [11]:
df_token['tweets'] = df_token.apply(lambda item: text_cleninig.clean(item.tweets), axis=1)

In [12]:
df_token.head()

Unnamed: 0,tweets,type
0,an inspiration in all aspects fashion,1
1,fitness,1
2,beauty and personality happy face or smiley ki...,1
3,apka apna awam ka channel frankline tv aam adm...,1
4,beautiful album from the greatest unsung guita...,1


> ### 0 or 1, if the word exists

In [13]:
df_token_exist = text_processing.word_exists(df_token, 'tweets')
df_df['0 or 1, if the word exists'][0] = df_token_exist


In [14]:
df_token_exist.head()

Unnamed: 0_level_0,aa,aah,aam,aamby,aando,aap,aaree,abbeydale,abbreviation,abc,...,yr,yummy,yura,yuri,zabardast,zac,zcc,zero,zoo,zoos
tweets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
an inspiration in all aspects fashion,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fitness,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
beauty and personality happy face or smiley kisses thefashionicon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
apka apna awam ka channel frankline tv aam admi production please visit or likes share happy face or smiley fb page,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
beautiful album from the greatest unsung guitar genius of our time and i have met the great backstage,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


> ### word count

In [15]:
df_token_count = text_processing.word_count(df_token, 'tweets')

In [17]:
df_df['word counts'][0] = df_token_count

> ### tfidf

In [18]:
df_token_tfidf = text_processing.tfidf(df_token, 'tweets')

In [19]:
df_df['TFIDF'][0] = df_token_tfidf

### Stemming

Stemming: This removes the difference between the inflected form of a word to reduce each word to its root form. This is done by mostly chopping off the end of words. One problem with streaming is that chopping words may result in words that are not part of the vocabulary. PorterStemmer and LancasterStemmer are two popular algorithms for streaming, which have rules on how to chop off a word.

In [20]:
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

Let's consider several stemmers: Porter Stemmer, Snowball Stemmer and Lancaster Stemmer

#### Snowball

In [21]:
df_stemmed_snow = df.copy(deep=True)
df_stemmed_snow['tweets'] = df_stemmed_snow.apply(lambda item: text_processing.stem_text(item.tweets, snowball), axis=1)
df_stemmed_snow.head()

Unnamed: 0,tweets,type
0,an inspir in all aspect fashion,1
1,fit,1
2,beauti and person happi face or smiley kiss th...,1
3,apka apna awam ka channel franklin tv aam admi...,1
4,beauti album from the greatest unsung guitar g...,1


#### Lancaster

In [22]:
df_stemmed_lanc = df.copy(deep=True)
df_stemmed_lanc['tweets'] = df_stemmed_lanc.apply(lambda item: text_processing.stem_text(item.tweets, lancaster), axis=1)
df_stemmed_lanc.head()

Unnamed: 0,tweets,type
0,an inspir in al aspect fash,1
1,fit,1
2,beauty and person happy fac or smiley kiss the...,1
3,apk apn awam ka channel franklin tv aam adm pr...,1
4,beauty alb from the greatest unsung guit geni ...,1


#### Porter

In [23]:
df_stemmed_porter = df.copy(deep=True)
df_stemmed_porter['tweets'] = df_stemmed_porter.apply(lambda item: text_processing.stem_text(item.tweets, porter), axis=1)
df_stemmed_porter.head()

Unnamed: 0,tweets,type
0,an inspir in all aspect fashion,1
1,fit,1
2,beauti and person happi face or smiley kiss th...,1
3,apka apna awam ka channel franklin tv aam admi...,1
4,beauti album from the greatest unsung guitar g...,1


> ### 0 or 1, if the word exists

In [24]:
df_stem_exist = text_processing.word_exists(df_stemmed_snow, 'tweets')

In [25]:
df_stem_exist.info()
df_df['0 or 1, if the word exists'][1] = df_stem_exist

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspir in all aspect fashion to amulya patnaik has been appoint new delhi polic commission patnaik is a agmut cadr ip offic
Columns: 4927 entries, aa to zoo
dtypes: int64(4927)
memory usage: 145.6+ MB


> ### word count

In [26]:
df_stem_count = text_processing.word_count(df_stemmed_snow, 'tweets')

In [27]:
df_stem_count.info()
df_df['word counts'][1] = df_stem_count

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspir in all aspect fashion to amulya patnaik has been appoint new delhi polic commission patnaik is a agmut cadr ip offic
Columns: 4927 entries, aa to zoo
dtypes: int64(4927)
memory usage: 145.6+ MB


> ### tfidf

In [28]:
df_stem_tfidf = text_processing.tfidf(df_stemmed_snow, 'tweets')

In [29]:
df_stem_tfidf.info()
df_df['TFIDF'][1] = df_stem_tfidf

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspir in all aspect fashion to amulya patnaik has been appoint new delhi polic commission patnaik is a agmut cadr ip offic
Columns: 4927 entries, aa to zoo
dtypes: float64(4927)
memory usage: 145.6+ MB


### Lemmatization

In contrast to stemming, lemmatization is a lot more powerful. It looks beyond word reduction and considers a language’s full vocabulary to apply a morphological analysis to words, aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

In [30]:
df_lemmatized = df.copy(deep=True)
df_lemmatized['tweets'] = df_lemmatized.apply(lambda item: text_processing.lem_text(item.tweets), axis=1)
df_lemmatized.head()

Unnamed: 0,tweets,type
0,an inspiration in all aspect fashion,1
1,fitness,1
2,beauty and personality happy face or smiley ki...,1
3,apka apna awam ka channel frankline tv aam adm...,1
4,beautiful album from the greatest unsung guita...,1


> ### 0 or 1, if the word exists

In [31]:
df_lem_exist = text_processing.word_exists(df_lemmatized, 'tweets')

In [32]:
df_lem_exist.info()
df_df['0 or 1, if the word exists'][2] = df_lem_exist

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspiration in all aspect fashion to amulya patnaik ha been appointed new delhi police commissioner patnaik is a agmut cadre ip officer
Columns: 5632 entries, aa to zoo
dtypes: int64(5632)
memory usage: 166.4+ MB


> ### word count

In [33]:
df_lem_count = text_processing.word_count(df_lemmatized, 'tweets')

In [34]:
df_lem_count.info()
df_df['word counts'][2] = df_lem_count

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspiration in all aspect fashion to amulya patnaik ha been appointed new delhi police commissioner patnaik is a agmut cadre ip officer
Columns: 5632 entries, aa to zoo
dtypes: int64(5632)
memory usage: 166.4+ MB


> ### tfidf

In [35]:
df_lem_tfidf = text_processing.tfidf(df_lemmatized, 'tweets')

In [36]:
df_lem_tfidf.info()
df_df['TFIDF'][2] = df_lem_tfidf

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspiration in all aspect fashion to amulya patnaik ha been appointed new delhi police commissioner patnaik is a agmut cadre ip officer
Columns: 5632 entries, aa to zoo
dtypes: float64(5632)
memory usage: 166.4+ MB


### Stemming + misspellings

In [37]:
df_stem_spell_snow = df.copy(deep=True)
df_stem_spell_snow['tweets'] = df_stem_spell_snow.apply(lambda item: text_processing.stem_text(item.tweets, snowball, misspelling=True), axis=1)


> ### 0 or 1, if the word exists

In [38]:
df_stem_spell_exist = text_processing.word_exists(df_stem_spell_snow, 'tweets')

In [39]:
df_stem_spell_exist.info()
df_df['0 or 1, if the word exists'][3] = df_stem_spell_exist

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspir in all aspect fashion to amulet patsak has been appoint new delhi polic commission patsak is a gamut cadr ip offic
Columns: 4553 entries, abb to zoo
dtypes: int64(4553)
memory usage: 134.6+ MB


> ### word count

In [40]:
df_stem_spell_count = text_processing.word_count(df_stemmed_snow, 'tweets')

In [41]:
df_stem_spell_count.info()
df_df['word counts'][3] = df_stem_spell_count

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspir in all aspect fashion to amulya patnaik has been appoint new delhi polic commission patnaik is a agmut cadr ip offic
Columns: 4927 entries, aa to zoo
dtypes: int64(4927)
memory usage: 145.6+ MB


> ### tfidf

In [42]:
df_stem_spell_tfidf = text_processing.tfidf(df_stemmed_snow, 'tweets')

In [43]:
df_stem_spell_tfidf.info()
df_df['TFIDF'][3] = df_stem_spell_tfidf

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspir in all aspect fashion to amulya patnaik has been appoint new delhi polic commission patnaik is a agmut cadr ip offic
Columns: 4927 entries, aa to zoo
dtypes: float64(4927)
memory usage: 145.6+ MB


### Lemmatization + misspellings

In [44]:
df_lemmatized = df.copy(deep=True)
df_lemmatized['tweets'] = df_lemmatized.apply(lambda item: text_processing.lem_text(item.tweets, misspelling=True), axis=1)

> ### 0 or 1, if the word exists

In [45]:
df_lem_spell_exist = text_processing.word_exists(df_lemmatized, 'tweets')

In [46]:
df_lem_spell_exist.info()
df_df['0 or 1, if the word exists'][4] = df_lem_spell_exist

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspiration in all aspect fashion to amulet patsak ha been appointed new delhi police commissioner patsak is a gamut cadre ip officer
Columns: 5275 entries, abb to zoo
dtypes: int64(5275)
memory usage: 155.9+ MB


> ### word count

In [47]:
df_lem_spell_count = text_processing.word_count(df_lemmatized, 'tweets')

In [48]:
df_lem_spell_count.info()
df_df['word counts'][4] = df_lem_spell_count

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspiration in all aspect fashion to amulet patsak ha been appointed new delhi police commissioner patsak is a gamut cadre ip officer
Columns: 5275 entries, abb to zoo
dtypes: int64(5275)
memory usage: 155.9+ MB


> ### tfidf

In [49]:
df_lem_spell_tfidf = text_processing.tfidf(df_lemmatized, 'tweets')

In [50]:
df_lem_spell_tfidf.info()
df_df['TFIDF'][4] = df_lem_spell_tfidf

<class 'pandas.core.frame.DataFrame'>
Index: 3873 entries, an inspiration in all aspect fashion to amulet patsak ha been appointed new delhi police commissioner patsak is a gamut cadre ip officer
Columns: 5275 entries, abb to zoo
dtypes: float64(5275)
memory usage: 155.9+ MB


### Other ideas of preprocessing

> ### 0 or 1, if the word exists

> ### word count

> ### tfidf

## Similarity

## Machine learning

### Logistic regression

In [51]:
clf = LogisticRegression(random_state=21)

In [52]:
import machine_learning

df_res = machine_learning.model_preprocessing(clf, df, df_df)

In [53]:
df_res

Unnamed: 0,"0 or 1, if the word exists",word counts,TFIDF
just tokenization,0.903226,0.904516,0.889032
stemming,0.909677,0.905806,0.892903
lemmatization,0.900645,0.900645,0.887742
stemming + misspellings,0.908387,0.905806,0.892903
lemmatization + misspellings,0.901935,0.903226,0.887742
any other ideas,,,


### Gaussian Naive Bayes

In [54]:
clf = GaussianNB()

In [56]:
df_res = machine_learning.model_preprocessing(clf, df, df_df)

In [57]:
df_res

Unnamed: 0,"0 or 1, if the word exists",word counts,TFIDF
just tokenization,0.749677,0.750968,0.727742
stemming,0.775484,0.772903,0.732903
lemmatization,0.763871,0.766452,0.723871
stemming + misspellings,0.749677,0.772903,0.732903
lemmatization + misspellings,0.745806,0.745806,0.699355
any other ideas,,,


### Random Forest

In [58]:
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [59]:
df_res = machine_learning.model_preprocessing(clf, df, df_df)

In [60]:
df_res

Unnamed: 0,"0 or 1, if the word exists",word counts,TFIDF
just tokenization,0.460645,0.460645,0.455484
stemming,0.458065,0.458065,0.460645
lemmatization,0.47871,0.47871,0.474839
stemming + misspellings,0.48,0.458065,0.460645
lemmatization + misspellings,0.469677,0.469677,0.468387
any other ideas,,,


In [61]:
clf = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 42)

In [62]:
df_res = machine_learning.model_preprocessing(clf, df, df_df)

In [63]:
df_res

Unnamed: 0,"0 or 1, if the word exists",word counts,TFIDF
just tokenization,0.910968,0.910968,0.908387
stemming,0.910968,0.912258,0.907097
lemmatization,0.907097,0.904516,0.904516
stemming + misspellings,0.903226,0.912258,0.907097
lemmatization + misspellings,0.908387,0.905806,0.905806
any other ideas,,,


### Desicion tree

In [64]:
clf = tree.DecisionTreeClassifier()

In [66]:
df_res = machine_learning.model_preprocessing(clf, df, df_df)

In [67]:
df_res

Unnamed: 0,"0 or 1, if the word exists",word counts,TFIDF
just tokenization,0.877419,0.876129,0.882581
stemming,0.876129,0.872258,0.883871
lemmatization,0.865806,0.870968,0.872258
stemming + misspellings,0.88,0.874839,0.877419
lemmatization + misspellings,0.877419,0.882581,0.872258
any other ideas,,,


## Checklist

- final table with all algos and parameters and scores exist
- all 18 means of preprocessing used and corresponding datasets saved
- top 10 most similar tweets found for all 18 means of preprocessing
- >0.83
- grid search was used for finding best params
- word-to-vect

### Some useful links

- https://www.datacamp.com/community/tutorials/stemming-lemmatization-python
- sentiment analysis https://www.analyticsvidhya.com/blog/2021/09/sentiment-classification-using-nlp-with-text-analytics/
- https://becominghuman.ai/nlp-classifying-positive-and-negative-restaurant-reviews-bag-of-words-model-31e9abfd7286
- Comments classification https://github.com/msahamed/yelp_comments_classification_nlp/blob/master/word_embeddings.ipynb
- tokenization https://towardsdatascience.com/5-simple-ways-to-tokenize-text-in-python-92c6804edfc4
- Lemmatization https://pythobyte.com/stemming-and-lemmatization-82464/
- Fundamentals of Bag Of Words and TF-IDF https://medium.com/analytics-vidhya/fundamentals-of-bag-of-words-and-tf-idf-9846d301ff22
- How to Vectorize Text in DataFrames for NLP Tasks https://towardsdatascience.com/how-to-vectorize-text-in-dataframes-for-nlp-tasks-3-simple-techniques-82925a5600db
- Stemming и лемматизация в Python https://pythobyte.com/stemming-and-lemmatization-82464/
- https://www.bigdataschool.ru/blog/pyspark-vectorization.html
- https://towardsdatascience.com/benchmarking-python-nlp-tokenizers-3ac4735100c5
- https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list
- different stemmers https://machinelearningknowledge.ai/beginners-guide-to-stemming-in-python-nltk/
- preprocessing https://dataaspirant.com/nlp-text-preprocessing-techniques-implementation-python/#t-1600081660724