In [3]:
# Imports

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

import pandas as pd

import re

print('Import done.')

ModuleNotFoundError: No module named 'pandas'

In [4]:
# Load data
# Source: Tweets with traffic-related labels for developing a Twitter-based traffic information system.
# https://data.mendeley.com/datasets/c3xvj5snvv/1

trainingFile = 'tweets/1_TrainingSet_2Class.csv'
trainCsv = pd.read_csv(trainingFile, sep=",", header=None)

trainingData = pd.DataFrame({'tweets':trainCsv[2], 'isRoadIncident':trainCsv[0]})[['tweets', 'isRoadIncident']]

trainingData.head()

NameError: name 'pd' is not defined

In [26]:
# Pre-processing

def Preprocess(trainingData):
    stop_words = stopwords.words('english')
    stemmer = WordNetLemmatizer()
    preprocessed_tweets = []

    for tweet in trainingData['tweets']:

        # Remove Punctuation
        tweet = re.sub('[^A-Za-z]', ' ', tweet)

        # Lowercase
        tweet.lower()

        # Tokenize
        tokenized_tweet = word_tokenize(tweet)

        # Remove stopwords
        # There are stop words that actually help identify 
        # incidents, especially ones that indicates location.
        # In some cases accuracy is better without removing
        # stop words. 
        for word in tokenized_tweet:
            if word in stop_words:
                tokenized_tweet.remove(word)

        # Stemming
#         for i in range(len(tokenized_tweet)):
#             tokenized_tweet[i] = stemmer.lemmatize(tokenized_tweet[i])

        # Join the tokenized tweet
        tweet_text = " ".join(tokenized_tweet)

        # Add to list
        preprocessed_tweets.append(tweet_text)

    preprocessed_tweets = pd.Series(preprocessed_tweets)
    trainingData['preprocessed_tweets'] = preprocessed_tweets.values
    return trainingData

trainingData = Preprocess(trainingData)
trainingData.head()

Unnamed: 0,tweets,isRoadIncident,preprocessed_tweets
0,Disabled Vehicle on Westbound highway WB at Em...,1,Disabled Vehicle Westbound highway WB Emily Dr...
1,New Teacher Lunch &amp; training! Marker wars ...,0,New Teacher Lunch amp training Marker war w Gr...
2,And the spot in our #uhaultrends Canadian Des...,0,And spot our uhaultrends Canadian Destination ...
3,"years ago today #MLK gave his historic ""I Hav...",0,year ago today MLK gave historic I Have A Drea...
4,Aww it’s always hard to say goodbye! 😢 What’s...,0,Aww always hard say goodbye What s been favori...


In [27]:
# Create feature matrix

matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(trainingData['preprocessed_tweets']).toarray()
print('Feature matrix created.')

Feature matrix created.


In [28]:
# Split data

X_train, X_test, y_train, y_test = train_test_split(X, trainingData['isRoadIncident'])
print('Data split.')

Data split.


In [29]:
# Train

# Naive Bayes 
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)

from sklearn import svm
classifier = svm.SVC()
classifier = classifier.fit(X_train, y_train)
# svm_y_pred = classifier.predict(X_test)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ', accuracy*100, '%')

trainingData['predict'] = pd.Series(y_pred)
trainingData.head(100)

Accuracy =  97.20156555772994 %


Unnamed: 0,tweets,isRoadIncident,preprocessed_tweets,predict
0,Disabled Vehicle on Westbound highway WB at Em...,1,Disabled Vehicle Westbound highway WB Emily Dr...,1.0
1,New Teacher Lunch &amp; training! Marker wars ...,0,New Teacher Lunch amp training Marker war w Gr...,1.0
2,And the spot in our #uhaultrends Canadian Des...,0,And spot our uhaultrends Canadian Destination ...,1.0
3,"years ago today #MLK gave his historic ""I Hav...",0,year ago today MLK gave historic I Have A Drea...,0.0
4,Aww it’s always hard to say goodbye! 😢 What’s...,0,Aww always hard say goodbye What s been favori...,1.0
5,DO NOT PAY North Korea another single PENNY! ...,0,DO NOT PAY North Korea another single PENNY Am...,1.0
6,Congrats to sitcimguy for being #Uhaulactive!...,0,Congrats sitcimguy being Uhaulactive Enjoy swa...,0.0
7,"""This Mother’s Day, say thank you. Say, 'I lov...",0,This Mother Day say thank Say I love President...,0.0
8,"We're sorry to hear this, Andreas. Please con...",0,We sorry hear Andreas Please contact local Wor...,1.0
9,Construction on #US40 Both directions from NJ ...,1,Construction US Both direction NJ CR East NJ CR,1.0


In [30]:
# Test

# Load data
# Source: Tweets with traffic-related labels for developing a Twitter-based traffic information system.
# https://data.mendeley.com/datasets/c3xvj5snvv/1
testFile = 'tweets/1_TestSet_2Class.csv'
testCsv = pd.read_csv(testFile, sep=",", header=None)
testData = pd.DataFrame({'tweets':testCsv[2], 'isRoadIncident':testCsv[0]})[['tweets', 'isRoadIncident']]

# Pre-processing
testData = Preprocess(testData)

# Create feature matrix
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(testData['preprocessed_tweets']).toarray()

# Predict
y_test = testData['isRoadIncident']
y_pred = classifier.predict(X)

# Accuracy 
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ', accuracy*100, '%')

Accuracy =  50.18099990216221 %


In [56]:
# diff = pd.DataFrame(X, columns=["X"])
# diff
# diff["actual"] = y_test
# diff["predicted"] = y_pred

# incorrect = diff[diff["actual"] != diff["predicted"]]
# incorrect

testData['predict'] = pd.Series(y_pred)
testData.head()

Unnamed: 0,tweets,isRoadIncident,preprocessed_tweets,predict
0,"That's a question for the Gov, but as you ca...",0,that a question the gov as can see FL wil,0
1,"Update: Collision; highway WB: at Exit194, no ...",1,updat collis highway WB exit ln clsd ocrd AM P,0
2,SNAP E&amp;T Learning Academy builds national ...,0,snap E amp T learn academi build nation leader...,0
3,What Do You Mean? Taking it worldwide . #14Days,0,what Do you mean take worldwid day,0
4,Disabled Vehicle on Northbound highway NB at M...,1,disabl vehicl northbound highway NB mile marker,0
