# Categorical Data

>Implementation attempting to create a LogisticRegression() pipeline with categorical and numerical values

In [9]:
import pandas as pd

df = pd.read_csv("df_char.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,tweet_id,priority,category,postCategories_x,eventID,eventType,matchedName
0,211565974422425600,0.75,serviceavailable,19,fireColorado2012,Unknown,[]
1,211654415503990784,0.5,news,15,fireColorado2012,Unknown,[]
2,211681309368655872,0.25,news,15,fireColorado2012,Unknown,['informationwanted']
3,211685621125742592,0.25,official,16,fireColorado2012,Unknown,['volunteer']
4,211877049147736064,0.25,factoid,7,fireColorado2012,Unknown,['donations']


In [1]:
# Random Forest Algorithm
# seeding the generated number makes our results reproducible (good for debugging)
# This module implements pseudo-random number generators for various distributions.
from csv import reader
from datetime import datetime
from math import sqrt
from matplotlib import pyplot
from random import randrange
from random import seed
from sklearn import preprocessing
from sklearn import svm, datasets
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,log_loss
from sklearn.metrics import auc
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import itertools
import matplotlib.pyplot as plt
import numpy as np
import numpy as npX
import seaborn as sns

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          943 non-null    int64  
 1   priority          943 non-null    float64
 2   category          943 non-null    object 
 3   postCategories_x  943 non-null    int64  
 4   eventID           943 non-null    object 
 5   eventType         943 non-null    object 
 6   matchedName       25 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 51.7+ KB


In [11]:
X2 = df[['tweet_id', 'priority', 'postCategories_x']]
y2 = df['tweet_id'].truncate(after=942)

X2.shape

(943, 3)

In [12]:
### Split the dataset into 'Train' and 'Test' sets
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=0)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (707, 3)
X_test shape: (236, 3)
y_train shape: (707,)
y_test shape: (236,)


In [13]:
numeric_features = ['tweet_id', 'priority']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['category','matchedName']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

ValueError: A given column is not a column of the dataframe

In [14]:
from sklearn import set_config
set_config(display='diagram')
clf