In [76]:
import pandas as pd
import numpy as np
import re
from sklearn import linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV, train_test_split
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [77]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [78]:
def read_from_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        data = file.read().splitlines()
    return data

input_filename_countries = 'country_names.txt'
input_filename_cities = 'city_names.txt'
country_names = read_from_file(input_filename_countries)
city_names = read_from_file(input_filename_cities)

In [79]:
def find_location_name(location, names):
    for name in names:
        if name in location:
            return name
    return ''

In [80]:
train_data.location = train_data.location.fillna(' ')
test_data.location = test_data.location.fillna(' ')
train_data.location = train_data.location.str.lower()
test_data.location = test_data.location.str.lower()

locations_tr = [find_location_name(loc, city_names) for loc in train_data.location]
locations_ts = [find_location_name(loc, city_names) for loc in test_data.location]

train_data.location = locations_tr
test_data.location = locations_ts

In [81]:
#loc_values = location_tr.unique()
#loc_dict = dict(zip(loc_values, [i for i in range(len(loc_values))]))
#train_data.location = train_data.location.map(loc_dict)

In [82]:
locations_tr_array = np.array(locations_tr, dtype='<U') 
output_filename = 'locations_tr.txt'
np.savetxt(output_filename, locations_tr_array, fmt='%s')

In [83]:
replace_list = [':', ';', ',', '.', '?', '!', '(', ')', '*', '{', '}', '[', ']', '-', '^']
pattern = '|'.join([re.escape(char) for char in replace_list])
#train_data.text = train_data.text.str.replace(pattern, '', regex=True)
#test_data.text = test_data.text.str.replace(pattern, '', regex=True)
#train_data.text = train_data.text.str.replace(r'\bhttp\S+\b', '', regex=True)
#test_data.text = test_data.text.str.replace(r'\bhttp\S+\b', '', regex=True)
#train_data.text = train_data.text.str.replace(r'\b@\S+\b', '', regex=True)
#test_data.text = test_data.text.str.replace(r'\b@\S+\b', '', regex=True)
train_data.text = train_data.text.str.replace('%20', ' ')
train_data.text = train_data.text.str.replace('%20', ' ')
train_data.text = train_data.text.str.lower()
test_data.text = test_data.text.str.lower()

train_data.keyword = train_data.keyword.fillna(' ')
test_data.keyword = test_data.keyword.fillna(' ')
train_data.keyword = train_data.keyword.str.lower()
test_data.keyword = test_data.keyword.str.lower()
train_data.keyword = train_data.keyword.str.replace('%20', ' ')
test_data.keyword = test_data.keyword.str.replace('%20', ' ')

In [84]:
train_data.text.iloc[1260]

'1943: poland - work party prisoners in the nazi death camp treblinka rebelled seizing small arms and setting buildings on fire. #history'

In [85]:
count_vectorizer = CountVectorizer()
all_vector = count_vectorizer.fit_transform(train_data.text + train_data.keyword + train_data.location)

In [86]:
all_vector.shape

(7613, 23099)

In [87]:
text_vector_tr = count_vectorizer.transform(train_data.text)
text_vector_ts = count_vectorizer.transform(test_data.text)
key_vector_tr = count_vectorizer.transform(train_data.keyword)
key_vector_ts = count_vectorizer.transform(test_data.keyword)
loc_vector_tr = count_vectorizer.transform(train_data.location)
loc_vector_ts = count_vectorizer.transform(test_data.location)

In [88]:
text_vector_tr.max()

13

In [89]:
x_train = hstack([text_vector_tr, key_vector_tr, loc_vector_tr])
x_test = hstack([text_vector_ts, key_vector_ts, loc_vector_ts])
y_train = train_data.target

In [90]:
x_train.shape

(7613, 69297)

In [91]:
#clf = linear_model.RidgeClassifier()
clf = LogisticRegression(max_iter=1000)
#clf = RandomForestClassifier(random_state=42)

In [92]:
scores = model_selection.cross_val_score(clf, x_train, y_train, cv=3, scoring='f1')
scores.mean()

0.6325372259488835

In [56]:
clf.fit(x_train, y_train)
submission.target = clf.predict(x_test)
submission.to_csv('output.csv', index=False)