In [294]:
# подготовка инструментария
import pandas as pd
import sklearn
import nltk
import numpy as np
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ailove/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ailove/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ailove/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [378]:
# загрузка данных
data = pd.read_csv('eng_train.csv')
data = data[['sentiment', 'content']]

In [377]:
# 12 -> 7
r = []
for w in data.sentiment:
    if w == 'empty' or w == 'neutral':
        r.append('empty/neutral')
    elif w == 'worry' or w == 'sadness':
        r.append('worry/sadness')
    elif w == 'surprise' or w == 'relief':
        r.append('surprise/relief')
    elif w == 'love' or w == 'fun' or w == 'happiness':
        r.append('love/fun/happiness')
    elif w == 'hate' or w == 'anger':
        r.append('hate/anger')
    else:
        r.append(w)

In [379]:
# 13 -> 7
r = []
for w in data.sentiment:
    if w == 'empty' or w == 'neutral':
        r.append('neutral')
    elif w == 'worry' or w == 'sadness' or w == 'hate' or w == 'anger' or w == 'boredom':
        r.append('negative')
    else:
        r.append('posetive')

In [380]:
# пытаемся обработать данные вручную (опциональный кусок кода)
import re

# загрузка лемматизатора
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# загрузка стеммера
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# загрузка списка стопслов
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

processed_content = []

for row in data.content:
    row = row.lower() #приведение всех текстов к lowercase
    row = re.sub('((www\.[\s]+)|(https?://[^\s]+))', 'URL', row) #замена ссылок на URL
    row = re.sub('\_', ' ', row) #удаление подчеркивания
    row = re.sub('\!', ' ATTENTION', row) #замена !
    row = re.sub('\?', ' QUESTION', row) #замена ?
    row = re.sub('@[A-Za-z0-9]+', ' ', row) #удаление mentions
    row = re.sub('\W', ' ', row) #удаление символов
    row = re.sub('\_', ' ', row) #удаление подчеркивания
    row = re.sub('[\s]+', ' ', row) #удаление лишних пробелов
    row = nltk.word_tokenize(row)
    new_row = []
    for word in row:
        #word = ps.stem(word)
        word = lemmatizer.lemmatize(word)
        if word not in stopWords:
            new_row.append(word)
    processed_content.append(new_row)
    
# создание общего словаря слов
all_dict = []
for n in processed_content:
    string = ''
    for m in n:
        string += m + ' '
    all_dict.append(string)
    
# создание базы {words: [], sentiment: []}
raw_data = {}
raw_data['content'] = all_dict
#raw_data['sentiment'] = data.sentiment
raw_data['sentiment'] = r

data = pd.DataFrame(raw_data, columns = ['content', 'sentiment'])

In [381]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data.content, data.sentiment, test_size = 0.2, random_state = 0)
X_train.head()
Y_train.head()

3225     posetive
11815    negative
7338     negative
14980    negative
27167    posetive
Name: sentiment, dtype: object

In [382]:
#считаем векторы
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(24000, 20879)

In [383]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(24000, 20879)

In [384]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(24000, 20879)

In [385]:
#NB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, Y_train)
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])# пайплайн
text_clf.fit(X_train, Y_train)
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.5341666666666667

In [389]:
#SVM
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge',alpha=1e-3, random_state=42, max_iter=100, tol=None))])
text_clf.fit(X_train, Y_train)
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.5561666666666667

In [390]:
#RandomForest
from sklearn.ensemble import RandomForestClassifier
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=20, max_features="auto", random_state=42))])
text_clf.fit(X_train, Y_train)
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.5795

In [388]:
#ExtraTrees
from sklearn.ensemble import ExtraTreesClassifier
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0))])
text_clf.fit(X_train, Y_train)
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.5416666666666666