In [26]:
import pandas as pd

df = pd.read_csv('dataset_da21000.csv', delimiter=';', skipinitialspace=True, dtype={'Text': 'str', 'Sentiment': 'str'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21435 entries, 0 to 21434
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       21435 non-null  object
 1   Sentiment  21421 non-null  object
dtypes: object(2)
memory usage: 335.0+ KB


In [24]:
print(df.dtypes)

Text         object
Sentiment    object
dtype: object


In [8]:
df1 = df[df.isna().any(axis=1)]
df1
df.iloc[[386]]

Unnamed: 0,Text,Sentiment
386,ترانسفر رفت تا هتل با ون بود ولی استقبال با نو...,pos


Though we paid for the dataset, it contains null values :)))

In [30]:
df = df.dropna()
df = df.astype('str')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21421 entries, 0 to 21434
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       21421 non-null  object
 1   Sentiment  21421 non-null  object
dtypes: object(2)
memory usage: 502.1+ KB


pandas uses numpy and more specifically ndarray in behind. ndarray needs to know the size, so it can for example have the dtype int64 cause it knows the size (8 bytes) but it can't have the dtype str cause str is not fixed size and it has to keep a pointer to it.

A simple SVM

In [19]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

X, y = df['Text'], df['Sentiment']
print(pd.unique(y))

['pos' 'neg']


:))))))))

In [32]:
from sklearn.preprocessing import LabelEncoder

X, y = df['Text'], df['Sentiment']

target=y.values.tolist()
label_encoder = LabelEncoder()
y = np.array(label_encoder.fit_transform(target))

print(pd.unique(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


[1 0]


In [None]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC()),
                     ])

text_clf.fit(X_train, y_train)


predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

We should save the model in here 

In [6]:
# Save Model Using Pickle
import pickle

# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(text_clf, open(filename, 'wb'))
 
# some time later...
 

In [10]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.8913566275286462


In [12]:
loaded_model = pickle.load(open(filename, 'rb'))
print(loaded_model.predict(["استاااد می "]))

['neg']


In [1]:
import fasttext.util

fasttext.util.download_model('fa', if_exists='ignore')
ft = fasttext.load_model('cc.fa.300.bin')
print(ft)

<fasttext.FastText._FastText object at 0x7f09ac344e80>




In [2]:
print(ft.get_dimension())
fasttext.util.reduce_model(ft, 50)
print(ft.get_dimension())

300
50


In [4]:
ft.save_model('cc.fa.50.bin')

In [33]:
from keras.layers import Dropout, Dense, GRU, Embedding
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups


def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]
    embeddings_index = {}
    f = open("cc.fa.300.vec", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)

X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(X_train, X_test)

Found 58738 unique tokens.
(21421, 500)
Total 2000000 word vectors.


In [34]:
def Build_Model_RNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=300,
                         dropout=0.5):
    """
    def buildModel_RNN(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=300, dropout=0.5):
    word_index in word index ,
    embeddings_index is embeddings index, look at data_helper.py
    nClasses is number of classes,
    MAX_SEQUENCE_LENGTH is maximum lenght of text sequences
    """

    model = Sequential()
    hidden_layer = 3
    gru_node = 256

    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
    model.add(Embedding(len(word_index) + 1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))

    print(gru_node)
    for i in range(0, hidden_layer):
        model.add(GRU(gru_node, return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    # model.add(Dense(, activation='relu'))
    model.add(Dense(nclasses, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

model_RNN = Build_Model_RNN_Text(word_index, embeddings_index, 20)

model_RNN.summary()


256
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 300)          17621700  
_________________________________________________________________
gru_8 (GRU)                  (None, 500, 256)          428544    
_________________________________________________________________
dropout_6 (Dropout)          (None, 500, 256)          0         
_________________________________________________________________
gru_9 (GRU)                  (None, 500, 256)          394752    
_________________________________________________________________
dropout_7 (Dropout)          (None, 500, 256)          0         
_________________________________________________________________
gru_10 (GRU)                 (None, 500, 256)          394752    
_________________________________________________________________
dropout_8 (Dropout)          (None, 500, 256)     

In [1]:
import tensorflow as tf

X_train_Glove = tf.keras.preprocessing.sequence.pad_sequences(
    X_train_Glove, maxlen=None, padding='pre',
    truncating='pre'
)

model_RNN.fit(X_train_Glove, y_train,
              validation_data=(X_test_Glove, y_test),
              epochs=20,
              batch_size=128,
              verbose=2)

predicted = model_RNN.predict_classes(X_test_Glove)

print(metrics.classification_report(y_test, predicted))

NameError: name 'X_train_Glove' is not defined