In [7]:
# Imports

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

import pandas as pd

import re

print('Import done.')

Import done.


In [8]:
# Load data
# Source: Tweets with traffic-related labels for developing a Twitter-based traffic information system.
# https://data.mendeley.com/datasets/c3xvj5snvv/1

trainingFile = 'tweets/1_TrainingSet_2Class.csv'
trainCsv = pd.read_csv(trainingFile, sep=",", header=None)

trainingData = pd.DataFrame({'tweets':trainCsv[2], 'isRoadIncident':trainCsv[0]})[['tweets', 'isRoadIncident']]

trainingData.head()

Unnamed: 0,tweets,isRoadIncident
0,Disabled Vehicle on Westbound highway WB at Em...,1
1,New Teacher Lunch &amp; training! Marker wars ...,0
2,And the spot in our #uhaultrends Canadian Des...,0
3,"years ago today #MLK gave his historic ""I Hav...",0
4,Aww it’s always hard to say goodbye! 😢 What’s...,0


In [12]:
# Pre-processing

def Preprocess(trainingData):
    stop_words = stopwords.words('english')
    stemmer = WordNetLemmatizer()
    preprocessed_tweets = []

    for tweet in trainingData['tweets']:

        # Remove Punctuation
        tweet = re.sub('[^A-Za-z]', ' ', tweet)

        # Lowercase
        tweet = tweet.lower()

        # Tokenize
        tokenized_tweet = word_tokenize(tweet)

        # Remove stopwords
        # There are stop words that actually help identify 
        # incidents, especially ones that indicates location.
        # In some cases accuracy is better without removing
        # stop words. 
        for word in tokenized_tweet:
            if word in stop_words:
                tokenized_tweet.remove(word)

        # Stemming
#         for i in range(len(tokenized_tweet)):
#             tokenized_tweet[i] = stemmer.lemmatize(tokenized_tweet[i])

        # Join the tokenized tweet
        tweet_text = " ".join(tokenized_tweet)

        # Add to list
        preprocessed_tweets.append(tweet_text)

    preprocessed_tweets = pd.Series(preprocessed_tweets)
    trainingData['preprocessed_tweets'] = preprocessed_tweets.values
    return trainingData

trainingData = Preprocess(trainingData)
trainingData.head()

Unnamed: 0,tweets,isRoadIncident,preprocessed_tweets
0,Disabled Vehicle on Westbound highway WB at Em...,1,disabled vehicle westbound highway wb emily dr...
1,New Teacher Lunch &amp; training! Marker wars ...,0,new teacher lunch amp training marker wars w g...
2,And the spot in our #uhaultrends Canadian Des...,0,the spot our uhaultrends canadian destination ...
3,"years ago today #MLK gave his historic ""I Hav...",0,years ago today mlk gave historic have dream s...
4,Aww it’s always hard to say goodbye! 😢 What’s...,0,aww s always hard say goodbye s your favorite ...


In [13]:
# Create feature matrix

matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(trainingData['preprocessed_tweets']).toarray()
print('Feature matrix created.')

Feature matrix created.


In [14]:
# Split data

X_train, X_test, y_train, y_test = train_test_split(X, trainingData['isRoadIncident'])
print('Data split.')

Data split.


In [15]:
# Train

# Naive Bayes 
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)

from sklearn import svm
classifier = svm.SVC()
classifier = classifier.fit(X_train, y_train)
# svm_y_pred = classifier.predict(X_test)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ', accuracy*100, '%')

trainingData['predict'] = pd.Series(y_pred)
trainingData.head(100)

Accuracy =  96.95694716242662 %


Unnamed: 0,tweets,isRoadIncident,preprocessed_tweets,predict
0,Disabled Vehicle on Westbound highway WB at Em...,1,disabled vehicle westbound highway wb emily dr...,0.0
1,New Teacher Lunch &amp; training! Marker wars ...,0,new teacher lunch amp training marker wars w g...,1.0
2,And the spot in our #uhaultrends Canadian Des...,0,the spot our uhaultrends canadian destination ...,1.0
3,"years ago today #MLK gave his historic ""I Hav...",0,years ago today mlk gave historic have dream s...,1.0
4,Aww it’s always hard to say goodbye! 😢 What’s...,0,aww s always hard say goodbye s your favorite ...,1.0
5,DO NOT PAY North Korea another single PENNY! ...,0,not pay north korea another single penny ameri...,1.0
6,Congrats to sitcimguy for being #Uhaulactive!...,0,congrats sitcimguy being uhaulactive enjoy swa...,1.0
7,"""This Mother’s Day, say thank you. Say, 'I lov...",0,mother day say thank say love president obama,0.0
8,"We're sorry to hear this, Andreas. Please con...",0,re sorry hear andreas please contact local wor...,0.0
9,Construction on #US40 Both directions from NJ ...,1,construction us directions nj cr east nj cr,1.0


In [16]:
# Test

# Load data
# Source: Tweets with traffic-related labels for developing a Twitter-based traffic information system.
# https://data.mendeley.com/datasets/c3xvj5snvv/1
testFile = 'tweets/1_TestSet_2Class.csv'
testCsv = pd.read_csv(testFile, sep=",", header=None)
testData = pd.DataFrame({'tweets':testCsv[2], 'isRoadIncident':testCsv[0]})[['tweets', 'isRoadIncident']]

# Pre-processing
testData = Preprocess(testData)

# Create feature matrix
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(testData['preprocessed_tweets']).toarray()

# Predict
y_test = testData['isRoadIncident']
y_pred = classifier.predict(X)

# Accuracy 
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ', accuracy*100, '%')

Accuracy =  55.10224048527541 %


In [None]:
# diff = pd.DataFrame(X, columns=["X"])
# diff
# diff["actual"] = y_test
# diff["predicted"] = y_pred

# incorrect = diff[diff["actual"] != diff["predicted"]]
# incorrect

testData['predict'] = pd.Series(y_pred)
testData.head()

In [19]:
trainingData.head(1)

Unnamed: 0,tweets,isRoadIncident,preprocessed_tweets,predict
0,Disabled Vehicle on Westbound highway WB at Em...,1,disabled vehicle westbound highway wb emily dr...,0.0


In [None]:
# AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
# pee is stored in the balls

In [23]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

print('Import done')

Import done


In [92]:
# Load data
# Source: Kaggle
# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

trainingFile = 'ner/ner_dataset.csv'
trainCsv = pd.read_csv(trainingFile, encoding = "ISO-8859-1")
trainCsv

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [110]:
sents = list()
i = -1
for word in trainCsv.values:
    if type(word[0]) is str:
        sents.append(list())
        i += 1
    corpus = word[1], word[2], word[3]
    sents[i].append(tuple(corpus))
    
sents

[[('Thousands', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('demonstrators', 'NNS', 'O'),
  ('have', 'VBP', 'O'),
  ('marched', 'VBN', 'O'),
  ('through', 'IN', 'O'),
  ('London', 'NNP', 'B-geo'),
  ('to', 'TO', 'O'),
  ('protest', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('war', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('Iraq', 'NNP', 'B-geo'),
  ('and', 'CC', 'O'),
  ('demand', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('withdrawal', 'NN', 'O'),
  ('of', 'IN', 'O'),
  ('British', 'JJ', 'B-gpe'),
  ('troops', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('that', 'DT', 'O'),
  ('country', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Families', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('soldiers', 'NNS', 'O'),
  ('killed', 'VBN', 'O'),
  ('in', 'IN', 'O'),
  ('the', 'DT', 'O'),
  ('conflict', 'NN', 'O'),
  ('joined', 'VBD', 'O'),
  ('the', 'DT', 'O'),
  ('protesters', 'NNS', 'O'),
  ('who', 'WP', 'O'),
  ('carried', 'VBD', 'O'),
  ('banners', 'NNS', 'O'),
  ('with', 'IN', 'O'),
  ('such', 'JJ', 'O'),
  ('slogans', 'NNS', 'O'),
  (

In [132]:
train_sents = sents[:10000]
test_sents = sents[10000:20000]

In [133]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [134]:
sent2features(train_sents[0])[0]


{'+1:postag': 'IN',
 '+1:postag[:2]': 'IN',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'of',
 'BOS': True,
 'bias': 1.0,
 'postag': 'NNS',
 'postag[:2]': 'NN',
 'word.isdigit()': False,
 'word.istitle()': True,
 'word.isupper()': False,
 'word.lower()': 'thousands',
 'word[-2:]': 'ds',
 'word[-3:]': 'nds'}

In [135]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

Wall time: 1.35 s


In [136]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 49.7 s


In [130]:
labels = list(crf.classes_)
# labels.remove('B-ORG')
# labels.remove('I-ORG')
# labels.remove('B-PER')
# labels.remove('I-PER')
# labels.remove('B-MISC')
# labels.remove('I-MISC')
labels.remove('O')
labels

['B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat']

In [137]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.8166201625263934

In [144]:
# Test the model with some stuff

sent_text = nltk.sent_tokenize("Big thing in West Bandung.") # this gives us a list of sentences
# now loop over each sentence and tokenize it separately
sent = list()
for sentence in sent_text:
    tokenized_text = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokenized_text)
    sent.append(tagged)
    print(tagged)
sent

[('Big', 'JJ'), ('thing', 'NN'), ('in', 'IN'), ('West', 'NNP'), ('Bandung', 'NNP'), ('.', '.')]


[[('Big', 'JJ'),
  ('thing', 'NN'),
  ('in', 'IN'),
  ('West', 'NNP'),
  ('Bandung', 'NNP'),
  ('.', '.')]]

In [145]:
test = [sent2features(s) for s in sent]


In [146]:
y_pred = crf.predict(test)
y_pred
# metrics.flat_f1_score(y_test, y_pred,
#                       average='weighted', labels=labels)

[['O', 'O', 'O', 'B-geo', 'I-geo', 'O']]

In [150]:
locs = list()
for i in range (0, len(y_pred)):
    loc = ''
    for j in range(0, len(y_pred[i])):
        if y_pred[i][j] == 'B-geo':
            loc += sent[i][j][0]
        if y_pred[i][j] == 'I-geo':
            loc += ' ' + sent[i][j][0]
    locs.append(loc)
locs


['West Bandung']