In [161]:
import sys
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install numpy
#!{sys.executable} -m pip install sklearn
#!{sys.executable} -m pip install seaborn
#!{sys.executable} -m pip install matplotlib

In [162]:
import pandas as pd
import numpy as np
import os
import nltk
import re
import string
import seaborn as sn
import matplotlib.pyplot as plt

In [163]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [164]:
## count the number of characters
def count_chars(text):
    return len(text)

## count the number of words
def count_words(text):
    return len(text.split())

## count the number of sentences
def count_sent(text):
    return len(nltk.sent_tokenize(text))

## count the number of unique words within the tweet
def count_unique(text):
    return len(set(text.split()))

## count hashtags
def count_htags(text):
    return len(re.findall(r'(#w[A-Za-z0-9]*)', text))

## count capital letters
def count_capital_chars(text):
    count = 0
    for x in text:
        if x.isupper():
            count+=1
    return count

## count capital words
def count_capital_words(text):
    return sum(map(str.isupper, text.split()))

## count stopwords
def count_stopwords(text):
    ## not sure how to do
    return 0

In [165]:
    for dirname, _, filenames in os.walk('.\data'):
        for filename in filenames:
            print(os.path.join(dirname, filename))

    train = pd.read_csv('./data/train.csv')
    print("load data done")
    
    print('training: ' + str(train.shape))
    print('total keywords NA: ' + str(train.keyword.nunique()))
    print('total locations NA:' + str(train.location.nunique()))   

.\data\sample_submission.csv
.\data\test.csv
.\data\train.csv
load data done
training: (7613, 5)
total keywords NA: 221
total locations NA:3341


In [166]:
    # make feature columns
    train['char_count'] = train['text'].apply(lambda x:count_chars(x))
    train['word_count'] = train['text'].apply(lambda x:count_words(x))
    train['sent_count'] = train['text'].apply(lambda x:count_sent(x))
    train['cap_char_count'] = train['text'].apply(lambda x:count_capital_chars(x))
    train['cap_word_count'] = train['text'].apply(lambda x:count_capital_words(x))
    train['unique_word_count'] = train['text'].apply(lambda x:count_unique(x))
    train['htag_count'] = train['text'].apply(lambda x:count_htags(x))
    # average word length
    train['avg_word_length'] = train['char_count']/train['word_count']
    # average sentence length
    train['avg_sentence_length'] = train['word_count']/train['sent_count']
    # fraction of unique to total words
    train['unique_v_words'] = train['unique_word_count']/train['word_count']
    
    train.drop(columns=['id', 'keyword', 'location'],inplace=True)
    print(train.head())

                                                text  target  char_count  \
0  Our Deeds are the Reason of this #earthquake M...       1          69   
1             Forest fire near La Ronge Sask. Canada       1          38   
2  All residents asked to 'shelter in place' are ...       1         133   
3  13,000 people receive #wildfires evacuation or...       1          65   
4  Just got sent this photo from Ruby #Alaska as ...       1          88   

   word_count  sent_count  cap_char_count  cap_word_count  unique_word_count  \
0          13           1              10               1                 13   
1           7           2               5               0                  7   
2          22           2               2               0                 20   
3           8           1               1               0                  8   
4          16           1               3               0                 15   

   htag_count  avg_word_length  avg_sentence_length  unique_v_

In [167]:
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'httpS+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet
def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RTs@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet
def preprocess(sent):
    sent = remove_users(sent)
    sent = remove_links(sent)
    sent = sent.lower() # lower case
    sent = re.sub('['+string.punctuation + ']+', ' ', sent) # strip punctuation
    sent = re.sub(r'\s+',' ', sent) #remove double spacing
    sent = re.sub('([0-9]+)', '', sent) # remove numbers
    sent_token_list = [word for word in sent.split(' ')]
    sent = ' '.join(sent_token_list)
    return(sent)
train['text'] = train['text'].apply(lambda x: preprocess(x))
print(train.head())


                                                text  target  char_count  \
0  our deeds are the reason of this earthquake ma...       1          69   
1              forest fire near la ronge sask canada       1          38   
2  all residents asked to shelter in place are be...       1         133   
3    people receive wildfires evacuation orders i...       1          65   
4  just got sent this photo from ruby alaska as s...       1          88   

   word_count  sent_count  cap_char_count  cap_word_count  unique_word_count  \
0          13           1              10               1                 13   
1           7           2               5               0                  7   
2          22           2               2               0                 20   
3           8           1               1               0                  8   
4          16           1               3               0                 15   

   htag_count  avg_word_length  avg_sentence_length  unique_v_

In [168]:
def obtainTargets(data):
    return data['target'].to_numpy()

In [169]:
#train['text'] = train['text'].apply(lambda x: remove_URL(x)) \
 #             .apply(lambda x: remove_html(x)) \
 #              .apply(lambda x: remove_emoji(x)) \
 #             .apply(lambda x: remove_at(x)) \
#.apply(lambda x: remove_punct(x)) 
vectorizer = TfidfVectorizer()
train_tf_idf_features =  vectorizer.fit_transform(train['text'].astype(str).values.tolist()).toarray()
print(train_tf_idf_features)
train_tf_idf = pd.DataFrame(train_tf_idf_features)


train_Y = obtainTargets(train)
train.drop(columns=['target'])
features = ['char_count', 'word_count', 'sent_count',
       'cap_char_count', 'cap_word_count', 'unique_word_count', 'htag_count',
        'avg_word_length', 'avg_sentence_length', 'unique_v_words']

train = pd.merge(train_tf_idf,train[features],left_index=True, right_index=True)




[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [170]:
print(train.head())

     0    1    2    3    4    5    6    7    8    9  ...  char_count  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...          69   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...          38   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...         133   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...          65   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...          88   

   word_count  sent_count  cap_char_count  cap_word_count  unique_word_count  \
0          13           1              10               1                 13   
1           7           2               5               0                  7   
2          22           2               2               0                 20   
3           8           1               1               0                  8   
4          16           1               3               0                 15   

   htag_count  avg_word_length  avg_sentence_length  unique_v_words  
0           0   

In [171]:
X_train, X_test, y_train, y_test = train_test_split(train, train_Y, test_size=0.2)

In [160]:
logistic_regression_model = LogisticRegression(verbose=3, solver="saga").fit(X_train, y_train)
print(logistic_regression_model.score(vectorizer.transform(X_test), y_test))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 92 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished


AttributeError: 'int' object has no attribute 'lower'