In [1]:
import pandas as pd
import numpy as np
import nltk
from functools import reduce
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
from nltk.metrics.distance  import edit_distance
from textblob import TextBlob 
import re 
import pickle

In [2]:
# ! pip install textblob

In [3]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('words')

## загрузка данных 

In [4]:
neutral = pd.read_csv('./data/processedNeutral.csv').T.reset_index()
neutral = neutral.rename(columns = {'index': 'tweet'})
neutral['target'] = 'neutral'
neutral.head()

Unnamed: 0,tweet,target
0,Pak PM survives removal scare,neutral
1,but court orders further probe into corruptio...,neutral
2,Supreme Court quashes criminal complaint again...,neutral
3,Art of Living's fights back over Yamuna floodp...,neutral
4,livid.,neutral


In [5]:
neutral.shape

(1570, 2)

In [6]:
positive = pd.read_csv('./data/processedPositive.csv').T.reset_index()
positive = positive.rename(columns = {'index': 'tweet'})
positive['target'] = 'positive'
positive.head()

Unnamed: 0,tweet,target
0,An inspiration in all aspects: Fashion,positive
1,fitness,positive
2,beauty and personality. :)KISSES TheFashionIcon,positive
3,Apka Apna Awam Ka Channel Frankline Tv Aam Adm...,positive
4,Beautiful album from the greatest unsung guit...,positive


In [7]:
positive.shape

(1186, 2)

In [8]:
negative = pd.read_csv('./data/processedNegative.csv').T.reset_index()
negative = negative.rename(columns = {'index': 'tweet'})
negative['target'] = 'negative'
negative.head()

Unnamed: 0,tweet,target
0,How unhappy some dogs like it though,negative
1,talking to my over driver about where I'm goin...,negative
2,Does anybody know if the Rand's likely to fall...,negative
3,I miss going to gigs in Liverpool unhappy,negative
4,There isnt a new Riverdale tonight ? unhappy,negative


In [9]:
negative.shape

(1117, 2)

#### объединим все данные в один набор для простаты работы с ним

In [10]:
data = pd.concat([neutral, negative, positive], axis=0).reset_index().drop('index', axis=1)
data.head()

Unnamed: 0,tweet,target
0,Pak PM survives removal scare,neutral
1,but court orders further probe into corruptio...,neutral
2,Supreme Court quashes criminal complaint again...,neutral
3,Art of Living's fights back over Yamuna floodp...,neutral
4,livid.,neutral


In [11]:
data = data.sample(frac = 1).reset_index().drop('index', axis=1)

In [12]:
data.shape[0] == negative.shape[0] + neutral.shape[0] + positive.shape[0] 

True

## удаление пуктуации

In [13]:
data['my_preproces'] = data['tweet'].apply(lambda x: re.sub(r'[^\w\s]','', x))

## токенизация по словам

In [14]:
data['just_token'] = data['tweet'].apply(lambda x: nltk.word_tokenize(x))
data['stemming'] = data['tweet'].apply(lambda x: nltk.word_tokenize(x))
data['lemmatization'] = data['tweet'].apply(lambda x: nltk.word_tokenize(x))
data['stemming+'] = data['tweet'].apply(lambda x: nltk.word_tokenize(x))
data['misspelling'] = data['tweet'].apply(lambda x: nltk.word_tokenize(x))
data['lemmatization + misspelling'] = data['tweet'].apply(lambda x: nltk.word_tokenize(x))
data['my_preproces'] = data['tweet'].apply(lambda x: nltk.word_tokenize(x))


In [15]:
data['my_preproces'] = data['my_preproces'].apply(lambda x: [i.lower() for i in x])
data.head()

Unnamed: 0,tweet,target,my_preproces,just_token,stemming,lemmatization,stemming+,misspelling,lemmatization + misspelling
0,one direction (band) whats going on unhappy,negative,"[one, direction, (, band, ), whats, going, on,...","[one, direction, (, band, ), whats, going, on,...","[one, direction, (, band, ), whats, going, on,...","[one, direction, (, band, ), whats, going, on,...","[one, direction, (, band, ), whats, going, on,...","[one, direction, (, band, ), whats, going, on,...","[one, direction, (, band, ), whats, going, on,..."
1,Winners of awards at The Telegraph's - Advanta...,neutral,"[winners, of, awards, at, the, telegraph, 's, ...","[Winners, of, awards, at, The, Telegraph, 's, ...","[Winners, of, awards, at, The, Telegraph, 's, ...","[Winners, of, awards, at, The, Telegraph, 's, ...","[Winners, of, awards, at, The, Telegraph, 's, ...","[Winners, of, awards, at, The, Telegraph, 's, ...","[Winners, of, awards, at, The, Telegraph, 's, ..."
2,Enforcement Directorate Chief Karnal Singh get...,neutral,"[enforcement, directorate, chief, karnal, sing...","[Enforcement, Directorate, Chief, Karnal, Sing...","[Enforcement, Directorate, Chief, Karnal, Sing...","[Enforcement, Directorate, Chief, Karnal, Sing...","[Enforcement, Directorate, Chief, Karnal, Sing...","[Enforcement, Directorate, Chief, Karnal, Sing...","[Enforcement, Directorate, Chief, Karnal, Sing..."
3,to retain TV innings.,neutral,"[to, retain, tv, innings, .]","[to, retain, TV, innings, .]","[to, retain, TV, innings, .]","[to, retain, TV, innings, .]","[to, retain, TV, innings, .]","[to, retain, TV, innings, .]","[to, retain, TV, innings, .]"
4,widening trade deficit to $9.84bn.,neutral,"[widening, trade, deficit, to, $, 9.84bn, .]","[widening, trade, deficit, to, $, 9.84bn, .]","[widening, trade, deficit, to, $, 9.84bn, .]","[widening, trade, deficit, to, $, 9.84bn, .]","[widening, trade, deficit, to, $, 9.84bn, .]","[widening, trade, deficit, to, $, 9.84bn, .]","[widening, trade, deficit, to, $, 9.84bn, .]"


## Cтемминг текста

In [16]:
stemmer = PorterStemmer()
data['stemming'] = data['stemming'].apply(lambda sen: [stemmer.stem(word) for word in sen])
data['misspelling'] = data['misspelling'].apply(lambda sen: [stemmer.stem(word) for word in sen])

## Cтемминг + текста
Создание стеммера со снежным углом
Он также известен как алгоритм Porter2 stemming, поскольку он имеет тенденцию исправлять несколько недостатков в Porter stemmer. Давайте посмотрим, как его использовать.

In [17]:
snow_stemmer = SnowballStemmer(language='english')
data['stemming+'] = data['stemming+'].apply(lambda sen: [snow_stemmer.stem(word) for word in sen])

## Лемматизация  текста

In [18]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = word[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [19]:
lemmatizer = WordNetLemmatizer()

In [20]:
data['lemmatization'] = data['lemmatization'].apply(lambda sen: 
[lemmatizer.lemmatize(word[0],  get_wordnet_pos(word[1])) for word in nltk.pos_tag(sen)])


data['lemmatization + misspelling'] = data['lemmatization + misspelling'].apply(lambda sen: 
[lemmatizer.lemmatize(word[0],  get_wordnet_pos(word[1])) for word in nltk.pos_tag(sen)])

data['my_preproces'] = data['my_preproces'].apply(lambda sen: 
[lemmatizer.lemmatize(word[0],  get_wordnet_pos(word[1])) for word in nltk.pos_tag(sen)])

In [21]:
data.head()

Unnamed: 0,tweet,target,my_preproces,just_token,stemming,lemmatization,stemming+,misspelling,lemmatization + misspelling
0,one direction (band) whats going on unhappy,negative,"[one, direction, (, band, ), whats, go, on, un...","[one, direction, (, band, ), whats, going, on,...","[one, direct, (, band, ), what, go, on, unhappi]","[one, direction, (, band, ), whats, go, on, un...","[one, direct, (, band, ), what, go, on, unhappi]","[one, direct, (, band, ), what, go, on, unhappi]","[one, direction, (, band, ), whats, go, on, un..."
1,Winners of awards at The Telegraph's - Advanta...,neutral,"[winner, of, award, at, the, telegraph, 's, -,...","[Winners, of, awards, at, The, Telegraph, 's, ...","[winner, of, award, at, the, telegraph, 's, -,...","[Winners, of, award, at, The, Telegraph, 's, -...","[winner, of, award, at, the, telegraph, 's, -,...","[winner, of, award, at, the, telegraph, 's, -,...","[Winners, of, award, at, The, Telegraph, 's, -..."
2,Enforcement Directorate Chief Karnal Singh get...,neutral,"[enforcement, directorate, chief, karnal, sing...","[Enforcement, Directorate, Chief, Karnal, Sing...","[enforc, director, chief, karnal, singh, get, ...","[Enforcement, Directorate, Chief, Karnal, Sing...","[enforc, director, chief, karnal, singh, get, ...","[enforc, director, chief, karnal, singh, get, ...","[Enforcement, Directorate, Chief, Karnal, Sing..."
3,to retain TV innings.,neutral,"[to, retain, tv, inning, .]","[to, retain, TV, innings, .]","[to, retain, tv, inning, .]","[to, retain, TV, inning, .]","[to, retain, tv, inning, .]","[to, retain, tv, inning, .]","[to, retain, TV, inning, .]"
4,widening trade deficit to $9.84bn.,neutral,"[widen, trade, deficit, to, $, 9.84bn, .]","[widening, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]"


##  стоп слова

In [22]:
def stopWords_fun(x):
    if x not in stopWords and len(x) >= 3:
        return True
    return False

In [23]:
stopWords = set(stopwords.words('english'))

In [24]:
data['my_preproces'] = data['my_preproces'].apply(lambda sen: list(filter(stopWords_fun, sen)))
data.head()

Unnamed: 0,tweet,target,my_preproces,just_token,stemming,lemmatization,stemming+,misspelling,lemmatization + misspelling
0,one direction (band) whats going on unhappy,negative,"[one, direction, band, whats, unhappy]","[one, direction, (, band, ), whats, going, on,...","[one, direct, (, band, ), what, go, on, unhappi]","[one, direction, (, band, ), whats, go, on, un...","[one, direct, (, band, ), what, go, on, unhappi]","[one, direct, (, band, ), what, go, on, unhappi]","[one, direction, (, band, ), whats, go, on, un..."
1,Winners of awards at The Telegraph's - Advanta...,neutral,"[winner, award, telegraph, advantage, 2017]","[Winners, of, awards, at, The, Telegraph, 's, ...","[winner, of, award, at, the, telegraph, 's, -,...","[Winners, of, award, at, The, Telegraph, 's, -...","[winner, of, award, at, the, telegraph, 's, -,...","[winner, of, award, at, the, telegraph, 's, -,...","[Winners, of, award, at, The, Telegraph, 's, -..."
2,Enforcement Directorate Chief Karnal Singh get...,neutral,"[enforcement, directorate, chief, karnal, sing...","[Enforcement, Directorate, Chief, Karnal, Sing...","[enforc, director, chief, karnal, singh, get, ...","[Enforcement, Directorate, Chief, Karnal, Sing...","[enforc, director, chief, karnal, singh, get, ...","[enforc, director, chief, karnal, singh, get, ...","[Enforcement, Directorate, Chief, Karnal, Sing..."
3,to retain TV innings.,neutral,"[retain, inning]","[to, retain, TV, innings, .]","[to, retain, tv, inning, .]","[to, retain, TV, inning, .]","[to, retain, tv, inning, .]","[to, retain, tv, inning, .]","[to, retain, TV, inning, .]"
4,widening trade deficit to $9.84bn.,neutral,"[widen, trade, deficit, 9.84bn]","[widening, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]"


In [25]:
data.dropna().shape

(3873, 9)

## исправление ошибок

In [26]:
correct_spellings = words.words()

In [27]:
def edit_distance_fun(entry):
    try:
        temp = [(edit_distance(entry, w),w) for w in correct_spellings if w[0]==entry[0]]
        return sorted(temp, key = lambda val:val[0])[0][1]
    except:
        return entry

In [28]:
data['misspelling'] =  data['misspelling'].apply(lambda sen: [TextBlob(word).correct() for word in sen])

In [29]:
data['lemmatization + misspelling'] = data['lemmatization + misspelling']\
.apply(lambda sen: [TextBlob(word).correct() for word in sen])

In [31]:
data['my_preproces'] = data['my_preproces']\
.apply(lambda sen: [TextBlob(word).correct() for word in sen])

In [32]:
data.head()

Unnamed: 0,tweet,target,my_preproces,just_token,stemming,lemmatization,stemming+,misspelling,lemmatization + misspelling
0,one direction (band) whats going on unhappy,negative,"[(o, n, e), (d, i, r, e, c, t, i, o, n), (b, a...","[one, direction, (, band, ), whats, going, on,...","[one, direct, (, band, ), what, go, on, unhappi]","[one, direction, (, band, ), whats, go, on, un...","[one, direct, (, band, ), what, go, on, unhappi]","[(o, n, e), (d, i, r, e, c, t), ((), (b, a, n,...","[(o, n, e), (d, i, r, e, c, t, i, o, n), ((), ..."
1,Winners of awards at The Telegraph's - Advanta...,neutral,"[(w, i, n, n, e, r), (a, w, a, r, d), (t, e, l...","[Winners, of, awards, at, The, Telegraph, 's, ...","[winner, of, award, at, the, telegraph, 's, -,...","[Winners, of, award, at, The, Telegraph, 's, -...","[winner, of, award, at, the, telegraph, 's, -,...","[(w, i, n, n, e, r), (o, f), (a, w, a, r, d), ...","[(D, i, n, n, e, r, s), (o, f), (a, w, a, r, d..."
2,Enforcement Directorate Chief Karnal Singh get...,neutral,"[(e, n, f, o, r, c, e, m, e, n, t), (d, i, r, ...","[Enforcement, Directorate, Chief, Karnal, Sing...","[enforc, director, chief, karnal, singh, get, ...","[Enforcement, Directorate, Chief, Karnal, Sing...","[enforc, director, chief, karnal, singh, get, ...","[(e, n, f, o, r, c, e), (d, i, r, e, c, t, o, ...","[(E, n, f, o, r, c, e, m, e, n, t), (D, i, r, ..."
3,to retain TV innings.,neutral,"[(r, e, t, a, i, n), (w, i, n, n, i, n, g)]","[to, retain, TV, innings, .]","[to, retain, tv, inning, .]","[to, retain, TV, inning, .]","[to, retain, tv, inning, .]","[(t, o), (r, e, t, a, i, n), (t, o), (w, i, n,...","[(t, o), (r, e, t, a, i, n), (o, f), (w, i, n,..."
4,widening trade deficit to $9.84bn.,neutral,"[(w, i, d, e, n), (t, r, a, d, e), (d, e, f, i...","[widening, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[widen, trade, deficit, to, $, 9.84bn, .]","[(w, i, d, e, n), (t, r, a, d, e), (d, e, f, i...","[(w, i, d, e, n), (t, r, a, d, e), (d, e, f, i..."


In [33]:
with open('./data/data.pickle', 'wb') as f:
    pickle.dump(data, f)

## Создаем векторы документа

In [34]:
def save_file(name_file, postfix):
    print(name_file, postfix)
    df = data[name_file].dropna().copy()
    if name_file in ['misspelling', 'lemmatization + misspelling', 'my_preproces']:
        df = df.apply(lambda sen: " ".join([str(word) for word in sen]))
    else:
        df = df.apply(lambda tokenized_sms:  " ".join(tokenized_sms))
    matrix = vectorizer.fit_transform(df)
    with open(f'./data/{name_file}_{postfix}.pickle', 'wb') as f:
        pickle.dump(matrix, f)

##### word exist 

In [35]:
vectorizer = CountVectorizer()
for col in data.columns[2:]:
    save_file(col, 'word_exist')

my_preproces word_exist
just_token word_exist
stemming word_exist
lemmatization word_exist
stemming+ word_exist
misspelling word_exist
lemmatization + misspelling word_exist


In [None]:
# data['my_preproces']

##### word count 

In [36]:
vectorizer = CountVectorizer()
for col in data.columns[2:]:
    save_file(col, 'word_count')

my_preproces word_count
just_token word_count
stemming word_count
lemmatization word_count
stemming+ word_count
misspelling word_count
lemmatization + misspelling word_count


##### Tfidf

In [37]:
vectorizer = TfidfVectorizer()
for col in data.columns[2:]:
    save_file(col, 'tfidf')

my_preproces tfidf
just_token tfidf
stemming tfidf
lemmatization tfidf
stemming+ tfidf
misspelling tfidf
lemmatization + misspelling tfidf


In [None]:
# vectorizer = TfidfVectorizer()
# df = data['just_token'].apply(lambda sen: " ".join([str(word) for word in sen]))
# matrix = vectorizer.fit_transform(df)
# matrix.toarray()
# m = matrix.toarray()
# for i in range(10):
#     print(sum((m[i] != 0)))
    
# print('\n====\n')
# for i in m[0]:
#     if i != 0:
#         print(i)

In [None]:
data['my_preproces'].drop_duplicates().reset_index()