In [1]:
# pandas
# marcos de datos
import pandas as pd


# libreria para medir precision
from sklearn.metrics import precision_score


# libreria para redes neuronales
from keras.utils.data_utils import get_file

Using TensorFlow backend.


In [2]:
emotions_source = "https://www.crowdflower.com/wp-content/uploads/2016/07/text_emotion.csv"
emotion_csv = get_file('text_emotion.csv', emotions_source)

In [3]:
emotion_df = pd.read_csv(emotion_csv)
emotion_df.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [4]:
# clasificador de emociones

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [6]:
VOCAB_SIZE = 50000
tfidf_vec = TfidfVectorizer(max_features=VOCAB_SIZE)
label_encoder = LabelEncoder()

In [7]:
emotion_df['content'][0]

'@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['

In [8]:
emotion_df['sentiment'][0]

'empty'

In [9]:
emotion_df['content'][1]

'Layin n bed with a headache  ughhhh...waitin on your call...'

In [10]:
emotion_df['sentiment'][1]

'sadness'

In [11]:
X = tfidf_vec.fit_transform(emotion_df['content'])
y = label_encoder.fit_transform(emotion_df['sentiment'])

In [12]:
bayes = MultinomialNB()

# entrenamos
bayes.fit(X, y)


# demostracion
pred = bayes.predict(X)

In [13]:
# precision
print("La precision es de {}%".format(str(precision_score(pred, y, average='micro')*100)[:5]) )

La precision es de 43.34%


In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse.csc import csc_matrix

In [15]:
clasificadores = {
    "SGD": SGDClassifier(loss='hinge'),
    "SVM": SVC(),
    "arboles_aleatorios": RandomForestClassifier()
}


In [16]:
clasificadores

{'SGD': SGDClassifier(alpha=0.0001, average=False, class_weight=None,
               early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
               l1_ratio=0.15, learning_rate='optimal', loss='hinge',
               max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
               power_t=0.5, random_state=None, shuffle=True, tol=0.001,
               validation_fraction=0.1, verbose=0, warm_start=False),
 'SVM': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
     kernel='rbf', max_iter=-1, probability=False, random_state=None,
     shrinking=True, tol=0.001, verbose=False),
 'arboles_aleatorios': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_sam

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
X_train.shape[0]

26800

In [19]:
for etiqueta, clf in clasificadores.items():
    clf.fit(X_train, y_train)
    prediciones = clf.predict(X_test)
    print("La precision del {} es de {}%".format(etiqueta, str(precision_score(prediciones, y_test, average='micro')*100)[:5]) )

La precision del SGD es de 32.85%




La precision del SVM es de 21.86%




La precision del arboles_aleatorios es de 28.71%
