#### 신기훈 씨가 만드신 코드

In [1]:
import pandas as pd

### Load Dataset

In [2]:
train = pd.read_csv("data/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
test = pd.read_csv("data/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [4]:
train["Phrase(origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [5]:
test["Phrase(origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [6]:
def clean_text(phrase):
    phrase = phrase.replace("doesn't ", "does not ")
    phrase = phrase.replace("ca n't ", "can not ")
    phrase = phrase.replace(" n't ", " not ")

    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


### Clean Text - Lemmatizer

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def clean_phrase_lemmatizer(phrase):
    words = phrase.split(' ')
    lemmatized_words = [lemmatizer.lemmatize(w) for w in words]
    lemmatized_phrase = ' '.join(lemmatized_words)
    return lemmatized_phrase

In [30]:
#train에 적용
train["Phrase"] = train["Phrase"].apply(clean_phrase_lemmatizer)

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapade demonstrating the adage t...,A series of escapades demonstrating the adage ...
2,A series of escapade demonstrating the adage t...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [8]:
test["Phrase"] = test["Phrase"].apply(clean_text)
test["Phrase"] = test["Phrase"].apply(clean_phrase_lemmatizer)

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Vectorize phrases

In [142]:
from sklearn.feature_extraction.text import CountVectorizer

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# binary=True/False
# lowercase=True/False
# ngram_range=(1, 1)
# stop_words=None

# vectorizer = CountVectorizer(max_features=1000)
vectorizer = CountVectorizer(max_features=10000, ngram_range=(1,2))
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+( [a-zA-Z]+)?\\b',
        tokenizer=None, vocabulary=None)

In [157]:
# Phrase 칼럼에 있는걸 vectorizer에 넣음. 이 train은 이미 lemmatize 되어 있음
vectorizer.fit(train["Phrase"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+( [a-zA-Z]+)?\\b',
        tokenizer=None, vocabulary=None)

In [144]:
X_train = vectorizer.transform(train["Phrase"])
X_train

<156060x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 705165 stored elements in Compressed Sparse Row format>

In [145]:
columns = vectorizer.get_feature_names()
pd.DataFrame(X_train[:100].toarray(), columns=columns).head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,a,about,act,acted,acting,action,actor,add,...,yourself,youth,yu,zaidan,zaidan supposed,zeal,zhang,zone,zone.1,zone with
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [146]:
X_test = vectorizer.transform(test["Phrase"])
X_test

<66292x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 254359 stored elements in Compressed Sparse Row format>

In [147]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

## Score

In [148]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=37)
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

In [149]:
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GroupKFold

# kfold = GroupKFold(n_splits=5)
kfold = GroupKFold(n_splits=7)


y_predict = cross_val_predict(model, X_train, y_train,cv=kfold,groups=train['SentenceId'])

print(y_predict.shape)
y_predict[0:10]



(156060,)


array([3, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [150]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)
print("Score = {0:.5f}".format(score))

Score = 0.53485


### score init: 0.583602 (countVectorize: 1,2 가 max)
주신코드: 0.58407

kFold:6 > 0.58435 // 7 > 0.58480 // 8> 0.58439

In [151]:
import numpy as np

result = train.copy()
result["Sentiment(predict)"] = y_predict
result["Difference(Phrase)"] = np.abs(y_train - y_predict)

print(result.shape)
result.head()

(156060, 6)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapade demonstrating the adage t...,1,A series of escapades demonstrating the adage ...,3,2
2,1,A series of escapade demonstrating the adage t...,2,A series of escapades demonstrating the adage ...,2,0
3,1,A series,2,A series,2,0
4,1,A,2,A,2,0
5,1,series,2,series,2,0


In [152]:
sentiment = result.groupby("SentenceId")["Difference(Phrase)"].mean()
print(sentiment.shape)
sentiment.head()

(8529,)


SentenceId
1    0.174603
2    0.611111
3    0.342857
4    0.700000
5    0.800000
Name: Difference(Phrase), dtype: float64

In [153]:
def find_sentiment(sentence_id):
    return sentiment.loc[sentence_id]

result["Difference(Sentence)"] = result["SentenceId"].apply(find_sentiment)
result = result.sort_values(by="Difference(Sentence)", ascending=False)

print(result.shape)
result.head()

(156060, 7)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase),Difference(Sentence)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
104273,5501,"good action , good acting , good dialogue , go...",4,"good action , good acting , good dialogue , go...",0,4,2.666667
104280,5501,good action,3,good action,2,1,2.666667
104271,5501,"good action , good acting , good dialogue , go...",4,"good action , good acting , good dialogue , go...",0,4,2.666667
104272,5501,"good action , good acting , good dialogue , go...",4,"good action , good acting , good dialogue , go...",0,4,2.666667
104274,5501,"good action , good acting , good dialogue , go...",4,"good action , good acting , good dialogue , go...",0,4,2.666667


In [154]:
vocabulary = vectorizer.get_feature_names()
result['Idx'] = result.index
def find_words(phraseId_id):
    vector = X_train[phraseId_id-1].toarray().reshape(-1)
    vocabulary_pairs = list(zip(vocabulary, vector))

    words = [word for word, count in vocabulary_pairs if count != 0]
    return words

# vocabulary.append("no fun")  # 아무 영향 X

result_final = result[0:1000].copy()
result_final["Keywords"] = result_final['Idx'].apply(find_words)
result_final.head()


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase),Difference(Sentence),Idx,Keywords
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
104273,5501,"good action , good acting , good dialogue , go...",4,"good action , good acting , good dialogue , go...",0,4,2.666667,104273,"[ acting, action, dialogue, pace]"
104280,5501,good action,3,good action,2,1,2.666667,104280,[ action]
104271,5501,"good action , good acting , good dialogue , go...",4,"good action , good acting , good dialogue , go...",0,4,2.666667,104271,"[ acting, action, cinematography, dialogue,..."
104272,5501,"good action , good acting , good dialogue , go...",4,"good action , good acting , good dialogue , go...",0,4,2.666667,104272,"[ acting, action, cinematography, dialogue,..."
104274,5501,"good action , good acting , good dialogue , go...",4,"good action , good acting , good dialogue , go...",0,4,2.666667,104274,"[ acting, action, dialogue, pace]"


In [155]:
# type(vocabulary)

In [156]:
result_final.to_csv("result.csv")

PermissionError: [Errno 13] Permission denied: 'result.csv'

In [None]:
vocabulary = vectorizer.get_feature_names()
vocabulary[0:3]

In [None]:
pd.DataFrame(vocabulary, columns=["word"]).to_csv("vocabulary.csv")

In [None]:
result[result["Phrase"].str.contains("can not recommend")]

## Train

In [None]:
model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)

print(predictions.shape)
predictions[0:10]

## Submit

In [None]:
submission = pd.read_csv("data/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

In [None]:
# 경로(ex: baseline-script.csv)는 사용자 설정마다 다름
submission.to_csv("baseline-script.csv")