In [1]:
import pandas as pd
import numpy as np

In [2]:
#Extraction des data
def get_data (url):
    data = pd.read_csv(url,sep="\t", header=None)
    data.columns = ["id", "label", "text"]
    return data

train_data = get_data('twitter-2013train-A.txt')
dev_data = get_data('twitter-2013dev-A.txt')
test_data = get_data('twitter-2013test-A.txt')

train_data

Unnamed: 0,id,label,text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
...,...,...,...
9679,103158179306807296,positive,RT @MNFootNg It's monday and Monday Night Foot...
9680,103157324096618497,positive,All I know is the road for that Lomardi start ...
9681,100259220338905089,neutral,"All Blue and White fam, we r meeting at Golden..."
9682,104230318525001729,positive,@DariusButler28 Have a great game agaist Tam...


In [3]:
#2) Mise en place du lexique du corpus
#nous supprimons toutes les ponctuations
# avec string.punctuation: !"#$%&'()*+, -./:;?@[\]^_`{|}~
import string
def remove_punctuation(text):
    translator = str.maketrans('','', string.punctuation)
    return text.translate(translator)

#nous supprimons les espaces en trop dans un tweet
def remove_whitespace(text):
    return " ".join(text.split())

#tokenisation
from nltk.tokenize import word_tokenize

def tokenization(text):
    tokens = word_tokenize(text)
    return tokens

#Amelioration
#Supprimons la casse
def text_lowercase(text):
    return text.lower()

#Stemming: Ne garder que la racine des mots
from nltk.stem.porter import PorterStemmer
def stem_words(tokens):
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in tokens]
    return stems

#Suppression des mots outils, des mots qui n'apporte pas d'information donc on peut les retirer sans affecter le sens de la phrase
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'));
    return [token for token in tokens if token not in stop_words]
#construction du lexique

def create_lexique(data):
    lexique = []
    for tweet in data['text']:
        lower_tweet = text_lowercase(tweet)
        tweet_without_punctuation = remove_punctuation(lower_tweet)
        new_tweet = remove_whitespace(lower_tweet)
        tokens = tokenization(new_tweet)
        tokens_with_stemming = stem_words(tokens)
        tokens_wthout_stpWords = remove_stopwords(tokens_with_stemming)
        for word in tokens:
            if word not in lexique:
                lexique.append(word)
    return lexique
    
train_data_lexique = create_lexique(train_data)


['gas',
 'by',
 'my',
 'house',
 'hit',
 '$',
 '3.39',
 '!',
 'i\\u2019m',
 'going',
 'to',
 'chapel',
 'hill',
 'on',
 'sat',
 '.',
 ':',
 ')',
 'theo',
 'walcott',
 'is',
 'still',
 'shit\\u002c',
 'watch',
 'rafa',
 'and',
 'johnny',
 'deal',
 'with',
 'him',
 'saturday',
 'its',
 'not',
 'that',
 'a',
 'gsp',
 'fan\\u002c',
 'i',
 'just',
 'hate',
 'nick',
 'diaz',
 'can\\u2019t',
 'wait',
 'for',
 'february',
 'iranian',
 'general',
 'says',
 'israel\\u2019s',
 'iron',
 'dome',
 'their',
 'missiles',
 '(',
 'keep',
 'talking',
 'like',
 'we',
 'may',
 'end',
 'up',
 'finding',
 'out',
 'tehran\\u002c',
 'mon',
 'amour',
 'obama',
 'tried',
 'establish',
 'ties',
 'the',
 'mullahs',
 'http',
 '//t.co/tzzzrrka',
 'via',
 '@',
 'pjmedia_com',
 'no',
 'barack',
 '-',
 'vote',
 'mitt',
 'romney',
 'through',
 'this',
 'whole',
 'movie',
 'harry',
 'ron',
 'at',
 'christmas',
 'ohlawd',
 'j',
 'davlar',
 '11th',
 'main',
 'rivals',
 'are',
 'team',
 'poland',
 'hopefully',
 'an',
 'make

In [4]:
#Assignation d'un numero unique a chaque lexique

def assign_id_to_lexique(lexique):
    new_lexique = {}
    for i in range(1,len(lexique)+1):
        new_lexique[lexique[i-1]] = i
    return new_lexique

train_lexique_with_id = assign_id_to_lexique(train_data_lexique)
len(train_lexique_with_id)


27486

In [6]:
#4) Decomptage pour chaque message, le nombre d'occurence des mots
# [0 for i in range(len(train_data['text'])]


def words_occurence(data, data_lexique_with_id):
    tab = []
    for tweet in data['text']:
        vector = [0 for i in range(len(data_lexique_with_id)+1)]
        for word in tokenization(tweet):
            if word in data_lexique_with_id.keys():
                vector[data_lexique_with_id[word]] +=1
               
        tab.append(vector)
    return np.array(tab)

train_tab = words_occurence(train_data, train_lexique_with_id)
dev_tab = words_occurence(dev_data, train_lexique_with_id)
test_tab = words_occurence(test_data, train_lexique_with_id)




[[0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 1]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
test_tab.shape

(3547, 27487)

In [9]:
#5) construction du fichier au format supporte par le SVM

def generate_svm_file(data, tab, file):
    dict = {}
    label_vector = []
    for label in data['label']:
        if label == 'positive':
            label = 1
        elif label == 'negative':
            label = 2
        elif label == 'neutral':
            label = 3
        label_vector.append(label)
    #train_svm = open("train.svm", "a")
    #for tweet in train_data['text']:
    train_data
    label_vector = np.array(label_vector)
    #print(np.array(label_vector))
    # il faut construire une structure qui contient, pour chauque ligne,le label, le mot avec sa clef et l'occurence

    # la j'ajoute a chaque ligne le label correspondant au tweet
    new_vector = np.insert(tab, 0, label_vector, axis=1)
    #print(new_vector)
    fichier = open(file, "a")
    for tweet in new_vector:
        label = str(tweet[0])
        s = label
        for i in range(0, len(tweet)-1):
            c=i+1
            var = str(tweet[c])
            if tweet[c] != 0:
                s = s + ' ' + str(c) + ':' + var
        s += '\n'
        fichier.write(s)
    fichier.close()

In [10]:
generate_svm_file(train_data, train_tab, "train_svm.svm")
generate_svm_file(dev_data, dev_tab, "dev_svm.svm")
generate_svm_file(test_data, test_tab, "test_svm.svm")

In [None]:
#Accuracy : 59.4587%
