In [29]:
import numpy as np
import pandas as pd 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [30]:
# Path to the training CSV. The r"..." (raw string) prefix keeps every
# backslash literal; without it, sequences such as "\n" and "\t" in this
# Windows path would be read as newline and tab escape characters.
TRAIN_CSV_PATH = r"D:\Archive\Kaggle\nlp-getting-started\train.csv"

# Load the training data into a DataFrame.
AccidentDf = pd.read_csv(TRAIN_CSV_PATH)

In [31]:
# Size of the training frame, shown as (rows, columns).
n_rows, n_cols = AccidentDf.shape
(n_rows, n_cols)

(7613, 5)

In [32]:
# Column guide for the training data:
#   id       - a unique identifier for each tweet
#   keyword  - a particular keyword from the tweet (may be blank)
#   location - the location the tweet was sent from (may be blank)
#   text     - the text of the tweet
#   target   - train.csv only: whether the tweet is about a real disaster (1) or not (0)

# Preview the first ten rows.
AccidentDf.iloc[:10]


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [33]:
# Data Exploration

In [34]:
# Show every distinct value in the keyword column (missing values show up as nan).
keyword_column = AccidentDf["keyword"]
keyword_column.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [35]:
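# Disabled: filling NaNs directly on the DataFrame. The same "Unknown" fill is
# done by the SimpleImputer step of the categorical pipeline instead.
# AccidentDf["keyword"] = AccidentDf["keyword"].fillna("Unknown") # filling in nans
# AccidentDf["location"] = AccidentDf["location"].fillna("Unknown") # filling in nans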

In [36]:
# Columns that are treated as categorical inputs.
cat_feature = ["keyword", "location"]

# Step 1: replace missing entries (NaN) with the constant label "Unknown".
missing_value_filler = SimpleImputer(
    missing_values=np.nan,
    strategy="constant",
    fill_value="Unknown",
)

# Step 2: one-hot encode the filled columns. handle_unknown="ignore" means a
# category that was not seen during fitting does not raise at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

# Chain the two steps so they are always applied together, in this order.
cat_transformer = Pipeline(
    steps=[
        ("imputer", missing_value_filler),
        ("onehot", one_hot_encoder),
    ]
)

In [37]:
# Send the categorical columns through the categorical pipeline. Only this one
# transformer is registered; no `remainder` is passed, so the default applies
# to any other column.
categorical_step = ("cat", cat_transformer, cat_feature)
preprocessor = ColumnTransformer(transformers=[categorical_step])

In [38]:
# Full model ("clf" is short for classifier): the preprocessing step followed
# by a logistic regression that uses the "lbfgs" solver.
model_steps = [
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(solver="lbfgs")),
]
clf = Pipeline(steps=model_steps)

In [39]:
# Label: the `target` column.
y = AccidentDf.loc[:, "target"]

# Features: every column except the label, the row id and the raw tweet text.
excluded_columns = ["target", "id", "text"]
X = AccidentDf.drop(columns=excluded_columns)

In [40]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore every score computed from this split, repeatable
# across kernel restarts; without it each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [41]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
# The fitted pipeline is the cell's displayed value.
clf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('cat', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value='Unknown', missing_values=nan,
       strategy='constant'...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [42]:
# Predictions for the rows the model was trained on. Despite the name, `test`
# holds training-set predictions.
test = clf.predict(X=X_train)

In [43]:
# Number of rows in the training split.
y_train.shape[0]

6090

In [50]:
# Training accuracy: the share of training rows whose prediction equals the
# label (about 81% in the recorded run).
train_matches = y_train == test
train_matches.mean()

0.8139573070607553

In [51]:
# Predictions for the held-out rows.
test1 = clf.predict(X=X_test)

In [53]:
# Held-out accuracy: the share of held-out rows whose prediction equals the
# label (about 0.72 in the recorded run).
holdout_matches = y_test == test1
holdout_matches.mean()

0.7189757058437295

In [114]:
# Path to the test CSV (raw string so the backslashes stay literal).
TEST_CSV_PATH = r"D:\Archive\Kaggle\nlp-getting-started\test.csv"

# Load the test data into a DataFrame.
AccidentTestDf = pd.read_csv(TEST_CSV_PATH)

In [115]:
# Feature frame for the test data: the same two columns the model was fitted on.
test_feature_columns = ["keyword", "location"]
TestX = AccidentTestDf.loc[:, test_feature_columns]

In [116]:
# Predict a label for every test row, reusing the TestX feature frame instead
# of slicing the same two columns out of AccidentTestDf a second time.
# clf.predict returns a plain array, so the Series is given TestX's index
# explicitly. Otherwise it would get a fresh 0..n-1 index, and the later
# index-aligned pd.concat with the ids could pair rows incorrectly if the
# test frame's index were ever not 0..n-1.
TestPred = pd.Series(clf.predict(TestX), index=TestX.index)

In [117]:
# TestPredDf is only created in the following cell, so referencing it here
# raises NameError on a fresh kernel (Restart & Run All). Preview the raw
# predictions, which already exist at this point, instead.
TestPred.head()

In [118]:
# Put each test id next to its prediction (column-wise concat), then make
# `id` the index. The prediction column is still labelled 0 at this point.
ids_with_predictions = pd.concat([AccidentTestDf["id"], TestPred], axis=1)
TestPredDf = ids_with_predictions.set_index("id")

In [124]:
# Give the prediction column (labelled 0 until now) its final name, "target".
column_names = {0: "target"}
TestPredDf = TestPredDf.rename(columns=column_names)

In [127]:
# Write the predictions to disk; the index is written as the first column.
TestPredDf.to_csv(path_or_buf="test predictions.csv")