https://www.kaggle.com/code/ryanholbrook/creating-features

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Global plot styling for every figure in this notebook.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# deprecated the old names (removed in 3.8), so use whichever name this
# installation provides and fall back to the default style otherwise.
whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",
)
plt.style.use(whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)




  plt.style.use("seaborn-whitegrid")


In [2]:
# Load the raw US-Accidents table from the attached Kaggle dataset.
accidents = pd.read_csv(
    filepath_or_buffer="/kaggle/input/us-accidents/US_Accidents_March23.csv"
)

In [3]:
# Load the automobile table from the attached Kaggle dataset.
autos = pd.read_csv(
    filepath_or_buffer="/kaggle/input/automobile-dataset/Automobile_data.csv"
)

In [4]:
# Load the marketing customer-value table from the attached Kaggle dataset.
customer = pd.read_csv(
    filepath_or_buffer="/kaggle/input/ibm-watson-marketing-customer-value-data/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv"
)


The concrete dataset is an Excel workbook (`.xls`), so `pd.read_csv` fails on it with an encoding error. It is read with `pd.read_excel` instead, which requires the Excel-reading package `xlrd`, installed below.

In [8]:
!pip install xlrd


Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.5/96.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-2.0.1


In [12]:

# Load the concrete table from the .xls workbook in the attached Kaggle dataset.
concrete = pd.read_excel(
    io="/kaggle/input/concrete-comprehensive-strength/Concrete_Data.xls"
)

In [None]:
# Show every column name of the autos table as a plain list.
list(autos.columns)

In [None]:
# Peek at the first rows of the two columns used for the ratio feature.
autos.loc[:, ["stroke", "bore"]].head(n=5)

In [None]:
# Print each column's dtype, e.g. to spot numeric-looking columns that were
# read in as strings and need converting before any arithmetic.
print(autos.dtypes)

Get the index labels of the rows where `stroke` or `bore` holds the unknown-value marker `"?"`; those rows cannot be used in the calculations.

In [None]:
# Index labels of the rows whose stroke is the "?" placeholder.
unknown_strokes = autos.loc[autos["stroke"].eq("?")].index.tolist()

In [None]:
# Index labels of the rows whose bore is the "?" placeholder.
unknown_bores = autos.loc[autos["bore"].eq("?")].index.tolist()

In [None]:
# Union of both label lists with duplicates removed (a row may be unknown in
# both columns).
invalid_auto_data = [*{*unknown_bores, *unknown_strokes}]
print(invalid_auto_data)

In [None]:
# Remove the rows flagged as unusable, then convert both measurement columns
# from strings to floats so they can be used in arithmetic.
# errors="ignore" keeps this cell re-runnable: on a second run the labels are
# already gone and a plain drop() would raise KeyError.
autos = autos.drop(index=invalid_auto_data, errors="ignore")
autos = autos.astype({"stroke": float, "bore": float})

In [None]:
# Derived feature: stroke divided by bore, element-wise.
autos["stroke_ratio"] = autos["stroke"].div(autos["bore"])

# Re-check the dtypes now that the new column exists.
print(autos.dtypes)

In [None]:
# Preview the inputs and the new ratio next to the cylinder count.
autos.loc[:, ["stroke", "bore", "stroke_ratio", "num-of-cylinders"]].head(n=5)

Ha, the `num-of-cylinders` column stores the counts as number words (strings) rather than integers, so they must be converted before use. See:
https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers

In [None]:
!pip install word2number

In [None]:
from word2number import w2n

# Try the conversion on the first remaining row. Use positional access
# (.iloc): rows were dropped from `autos` earlier, so its index labels may have
# gaps and label 0 is not guaranteed to exist.
print(w2n.word_to_num(autos["num-of-cylinders"].iloc[0]))

In [None]:
# Convert the number words in "num-of-cylinders" to integers, one value at a
# time. Mapping over the single column avoids building a row object for every
# record (as DataFrame.apply(axis=1) does), and passing non-string values
# through unchanged lets this cell be re-run after the column is converted.
autos["num-of-cylinders"] = autos["num-of-cylinders"].map(
    lambda cylinders: w2n.word_to_num(cylinders)
    if isinstance(cylinders, str)
    else cylinders
)

In [None]:
# Check the converted cylinder counts.
autos.loc[:, "num-of-cylinders"].head(n=5)

In [None]:
# This does not work: it passes the whole Series to word_to_num in one call
# instead of one value at a time (presumably word_to_num expects a single
# string -- confirm against the word2number docs). The element-wise conversion
# in the earlier cell is the working version.
# autos["num-of-cylinders"] = w2n.word_to_num(autos["num-of-cylinders"])

In [None]:
# Displacement = pi * (bore / 2)^2 * stroke * number of cylinders.
bore_circle_area = np.pi * (0.5 * autos["bore"]) ** 2
autos["displacement"] = bore_circle_area * autos["stroke"] * autos["num-of-cylinders"]

In [None]:
# Check the first computed displacement values.
autos.loc[:, "displacement"].head(n=5)

In [None]:
# Print the accidents columns and their dtypes before building features from them.
print(accidents.dtypes)

In [None]:
# log1p computes log(1 + x), which stays finite where the feature is 0.0
# (plain np.log would give -inf there).
wind_speed = accidents["Wind_Speed(mph)"]
accidents["LogWindSpeed"] = wind_speed.apply(np.log1p)

# Side-by-side density plots: raw values on the left, log-transformed on the right.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
ax_raw, ax_log = axs
sns.kdeplot(wind_speed, fill=True, ax=ax_raw)
sns.kdeplot(accidents["LogWindSpeed"], fill=True, ax=ax_log);

In [None]:
# Roadway-feature flag columns, spelled as in the raw US_Accidents_March23.csv.
# The raw file uses underscores ("Give_Way", "No_Exit", "Traffic_Calming",
# "Traffic_Signal"); the tutorial's cleaned copy used "GiveWay" etc., which
# raises KeyError against this file.
roadway_features = ["Amenity", "Bump", "Crossing", "Give_Way",
    "Junction", "No_Exit", "Railway", "Roundabout", "Station", "Stop",
    "Traffic_Calming", "Traffic_Signal"]

# Row-wise sum of the flags = number of roadway features recorded for each
# accident (assumes the flag columns are boolean/0-1 -- confirm via dtypes).
accidents["RoadwayFeatures"] = accidents[roadway_features].sum(axis=1)

accidents[roadway_features + ["RoadwayFeatures"]].head(10)

In [13]:
# Print the concrete table's original (verbose) column names and their dtypes.
print(concrete.dtypes)

Cement (component 1)(kg in a m^3 mixture)                float64
Blast Furnace Slag (component 2)(kg in a m^3 mixture)    float64
Fly Ash (component 3)(kg in a m^3 mixture)               float64
Water  (component 4)(kg in a m^3 mixture)                float64
Superplasticizer (component 5)(kg in a m^3 mixture)      float64
Coarse Aggregate  (component 6)(kg in a m^3 mixture)     float64
Fine Aggregate (component 7)(kg in a m^3 mixture)        float64
Age (day)                                                  int64
Concrete compressive strength(MPa, megapascals)          float64
dtype: object


Borrow some code from a previous tutorial to rename the columns in a more useful fashion

In [14]:
# Short column names used by the tutorial, listed in the same order as the
# columns of the loaded workbook.
tutorial_column_names = [
    "Cement",
    "BlastFurnaceSlag",
    "FlyAsh",
    "Water",
    "Superplasticizer",
    "CoarseAggregate",
    "FineAggregate",
    "Age",
    "CompressiveStrength",
]

# Despite its name, `new_column_names` holds the names currently on the frame;
# they are paired positionally with the tutorial names to build the mapping.
new_column_names = list(concrete.columns)
columns_dict = {
    current: short for current, short in zip(new_column_names, tutorial_column_names)
}

concrete.rename(columns=columns_dict, inplace=True)

concrete.columns

Index(['Cement', 'BlastFurnaceSlag', 'FlyAsh', 'Water', 'Superplasticizer',
       'CoarseAggregate', 'FineAggregate', 'Age', 'CompressiveStrength'],
      dtype='object')

In [15]:
# Ingredient columns of a concrete mixture.
components = [
    "Cement",
    "BlastFurnaceSlag",
    "FlyAsh",
    "Water",
    "Superplasticizer",
    "CoarseAggregate",
    "FineAggregate",
]

# Count, per row, how many ingredients have an amount greater than zero.
concrete["Components"] = (concrete[components] > 0).sum(axis="columns")

concrete[components + ["Components"]].head(10)

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Components
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,5
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,5
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,5
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,5
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,5
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,5
6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,5
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,5
8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,5
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,4
