In [None]:
import numpy as np

# Pandas

In [None]:
import pandas as pd

# Read a CSV: use 'col1' as the index, parse 'col2' as dates, treat 'N/A' as missing
df = pd.read_csv(
    'data.csv',
    index_col='col1',
    parse_dates=['col2'],
    na_values=['N/A'],
)
# Build a frame from a dict mapping column name -> list of values
df = pd.DataFrame(dict(col1=[1, 2, 3], col2=[4, 5, 6]))

# Stack frames on top of each other (axis=0 is the default)
pd.concat([df1, df2])
# SQL-style inner join on the shared 'col1' column
pd.merge(left=df1, right=df2, on='col1', how='inner')

# Every step rebinds `df` to the returned frame instead of using inplace=True:
# inplace brings no speed benefit, prevents chaining, and makes re-running a
# cell mutate state that earlier cells already displayed.
df = df.set_index('col1') # set index

df = df.rename(columns={'col1': 'new_col1', 'col2': 'new_col2'}) # rename columns

df = df.drop(columns=['col1', 'col2']) # drop columns by name
df = df.drop(columns=df.columns[1]) # drop a column by position (here the second one)
df = df.drop(index=[0, 1]) # drop rows by index label

df.duplicated() # boolean mask marking duplicate rows
df = df.drop_duplicates() # drop duplicate rows
df = df.drop_duplicates(subset=['col1', 'col2'], keep='last') # dedupe on the given columns only, keeping the last occurrence

# Peek at rows
df.head(5) # first 5 rows
df.tail(5) # last 5 rows
df.sample(n=1) # one random row
df.sample(n=10) # 10 random rows

# Dimensions and labels
df.shape # (rows, columns)
df.columns # column labels
df.index # row labels

# Summaries
df.info() # column names, data types, memory usage
df.describe() # summary statistics for numeric columns

df['col1'] # get a single column (Series)
df[['col1', 'col2']] # get multiple columns (DataFrame)
df[['col1', 'col2']].to_numpy() # get multiple columns as a numpy array (to_numpy() is preferred over .values)

# iloc: integer positions, end of slice excluded
df.iloc[0] # get a row by position
df.iloc[0:5] # get multiple rows by position
df.iloc[0:5, 0:2] # get multiple rows and columns by position
df.iloc[0:5, 0:2].to_numpy() # same selection as a numpy array

# loc: index/column labels, both ends of a slice included
df.loc[0] # get the row labelled 0
df.loc[0:5] # get rows labelled 0 through 5
df.loc[0:5, 'col1':'col2'] # rows by label, columns from 'col1' through 'col2'
df.loc[0:5, ['col1', 'col2']] # rows by label, an explicit list of columns
df.loc[0:5, 'col1':'col2'].to_numpy() # same selection as a numpy array

# Boolean-mask filtering
df.loc[df['col1'] > 0] # single condition
df.loc[(df['col1'] > 0) & (df['col2'] < 0)] # both conditions (AND)
df.loc[(df['col1'] > 0) | (df['col2'] < 0)] # either condition (OR)

# The same filters written as query strings
df.query('col1 > 0')
df.query('col1 > 0 & col2 < 0')
df.query('col1 > 0 | col2 < 0')

# Aggregations
df.groupby(by=['col1', 'col2'])['col3'].mean() # mean of col3 per (col1, col2) pair
df.groupby(by='col1')['col2'].agg(['mean', 'count']) # several aggregates of one column
df.groupby(by='col1').agg({'col2': 'mean', 'col3': 'count'}) # a different aggregate per column

# Ordering
df.sort_values(by='col1', ascending=False) # sort by one column, descending
df.sort_values(by=['col1', 'col2'], ascending=[False, True]) # sort by several columns with mixed direction
df.sort_index(ascending=False) # sort by index, descending

df['col1'].value_counts(normalize=True, sort=False) # share of each unique value (proportions), left unsorted

df['col1'].unique() # get unique values
df['col1'].nunique() # count unique values

df['col1'].isna() # boolean mask of null values

df['col1'].fillna(0) # fill null values with a constant
# fillna(method=...) is deprecated; use the dedicated methods instead
df['col1'].ffill() # forward-fill: carry the last valid value forward
df['col1'].bfill() # backward-fill: use the next valid value

df['col1'].astype('int') # convert data type
df['col1'].astype('float') # convert data type
df['col1'].astype('str') # convert data type
df['col1'].astype('category') # convert data type

df['col1'].replace(0, 1) # replace one value
df['col1'].replace({0: 1, 1: 0}) # replace several values via a mapping

df['col1'].apply(lambda x: x * 2) # apply a function element-wise
df['col1'].apply(lambda x: x * 2 if x > 0 else x / 2) # apply a conditional function element-wise

df['col1'].map({0: 'zero', 1: 'one'}) # map values; keys missing from the dict become NaN

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# One call: fit the scaler on every column except 'target' and return the scaled array
features_standardized = scaler.fit_transform(df.drop(columns='target'))

# The same idea in two steps, here on two columns only
scaler.fit(df[['col1', 'col2']]) # learn the scaling statistics
df[['col1', 'col2']] = scaler.transform(df[['col1', 'col2']]) # apply them and write the result back

# RobustScaler can be swapped in for data with outliers

In [None]:
# Outlier clipping using the 1.5 * IQR rule: values beyond the fences are
# pulled back to the fence rather than dropped.
q1 = df['col1'].quantile(0.25)
q3 = df['col1'].quantile(0.75)
iqr = q3 - q1
# A Series only has axis 0, so axis=1 raises ValueError. Assign the result back
# instead of inplace=True, which does not reliably update the parent frame
# when applied to a column selection.
df['col1'] = df['col1'].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

In [None]:
# Binning with equal frequency: up to 10 quantile-based bins, stored as integer bin codes
df['col1'] = pd.qcut(x=df['col1'], q=10, labels=False, duplicates='drop')

# Binning with equal width: 10 same-width intervals, stored as integer bin codes
df['col1'] = pd.cut(x=df['col1'], bins=10, labels=False, duplicates='drop')

In [None]:
# Imputing strategies
from sklearn.impute import KNNImputer, SimpleImputer

# KNN imputation: more accurate but slower
imputer = KNNImputer(n_neighbors=5)

# Mean imputation; this assignment replaces the KNN imputer created above
imputer = SimpleImputer(strategy='mean')

# Fit the chosen imputer on df and return the filled-in array
features_imputed = imputer.fit_transform(X=df)

In [None]:
# encoding features
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# encoding targets
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MultiLabelBinarizer

encoder = OneHotEncoder()
features_encoded = encoder.fit_transform(df[['col1', 'col2']])
# OneHotEncoder exposes the learned categories as `categories_` (one array per
# input column); `classes_` exists only on the label encoders.
encoder.categories_
features_decoded = encoder.inverse_transform(features_encoded) # back to the original category values

# ordinal encoding can also be performed by using .replace()
encoding = {'col1': {'a': 0, 'b': 1, 'c': 2}} # column name -> {old value: new value}
df.replace(encoding, inplace=True)

# NLP

In [None]:
# Working with strings

df['col1'].str.lower() # convert to lowercase
df['col1'].str.upper() # convert to uppercase
df['col1'].str.strip() # strip leading/trailing whitespace
df['col1'].str.replace('old', 'new') # replace a substring
df['col1'].str.split(' ') # split each string into a list
df['col1'].str.contains('abc') # check for substring anywhere
df['col1'].str.startswith('abc') # check for prefix
df['col1'].str.endswith('abc') # check for suffix
# Regex patterns are raw strings: '\d' in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
df['col1'].str.extract(r'(\d+)', expand=False) # extract the first run of digits
df['col1'].str.findall(r'(\d+)').str[0] # find all runs of digits, keep the first match
df['col1'].str.count('abc') # count occurrences of substring
df['col1'].str.cat(sep=',') # concatenate all values into one string

In [None]:
# `from nltk... import` does not bind the name `nltk`, so the package itself
# must be imported before nltk.download() can be called.
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

nltk.download('wordnet') # corpus that WordNetLemmatizer looks words up in
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('dogs')

In [None]:
# python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp('dogs')
# Iterate the already-parsed `doc`; passing it through nlp() again would rerun
# the whole pipeline for no benefit.
# Keep lowercase lemmas of tokens that are not stop words, punctuation or
# numbers and are longer than one character (which also rules out empty lemmas).
[token.lemma_.lower().strip() for token in doc if (not token.is_stop)
                                               and (not token.is_punct)
                                               and (not token.is_digit)
                                               and (not token.like_num)
                                               and (len(token.lemma_.strip())>1)]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag of words (uni- and bigrams) restricted to a fixed vocabulary
count_2gram = CountVectorizer(tokenizer=word_tokenize, ngram_range=(1,2), stop_words="english", vocabulary=['dog', 'cat', 'monkey'])
bag = count_2gram.fit_transform(['dog ate homework', 'monkey ate banana'])

# TF-IDF over the corpus in df['text'] (the column the neighbour lookup below
# reads from). min_df=3 / max_df=0.5 cannot be satisfied by a two-document toy
# corpus and would raise ValueError there.
tfidf = TfidfVectorizer(tokenizer=word_tokenize, stop_words='english', ngram_range=(1,2), max_df=0.5, min_df=3)
feature_matrix = tfidf.fit_transform(df['text'])

# Nearest Neighbors
from sklearn.neighbors import NearestNeighbors

# Dense document-term matrix with one named column per term
dtm = pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names_out())
nn = NearestNeighbors(n_neighbors=10)
nn.fit(dtm)
# Query with the same dense, named-column layout the model was fitted on
n_dist, n_index = nn.kneighbors(
    pd.DataFrame(tfidf.transform(['ball bounce']).toarray(), columns=dtm.columns))
# n_index has one row of neighbour positions per query; they are row positions
# in the fitted matrix, so index df['text'] positionally.
for i in n_index:
    print(df['text'].iloc[i])

# Datetime

In [None]:
# pd.to_datetime returns a new Series; store it, otherwise 'col1' keeps its
# original dtype and the .dt accessor below fails.
df['col1'] = pd.to_datetime(df['col1'], format='%d-%m-%Y %I:%M %p') # convert to datetime (day-month-year, 12-hour clock with AM/PM)

df['col1'].dt.year # get year


# Feature Selection

In [None]:
from sklearn.decomposition import PCA

# Integer n_components: keep exactly that many components
pca = PCA(n_components=2)

# Float n_components between 0 and 1: keep enough components to explain that share of the variance
pca = PCA(n_components=0.95)

# Fit on df and project it onto the retained components
features_pca = pca.fit_transform(X=df)

# For data that is not linearly separable, use KernelPCA

In [None]:
from sklearn.feature_selection import chi2, f_classif, f_regression

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Name the feature matrix and target vector explicitly: later cells
# (clustering, grid search, cross-validation) use `features` and `target`.
features = df.drop(columns='target')
target = df['target']

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)

In [None]:
from sklearn.dummy import DummyRegressor, DummyClassifier

# Baseline for regression: always predicts the training mean
dummy = DummyRegressor(strategy='mean')
# Baseline for classification: always predicts the most frequent class
# (this assignment replaces the regressor above)
dummy = DummyClassifier(strategy='most_frequent')

dummy.fit(X=X_train, y=y_train)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares; fit() returns the fitted estimator itself
regressor = LinearRegression()
model = regressor.fit(X=X_train, y=y_train)

In [None]:
from sklearn.linear_model import Ridge, Lasso

# Ridge (L2 penalty): punishes large coefficients
regressor = Ridge(alpha=0.5)
# Lasso (L1 penalty): shrinks coefficients to zero; replaces the Ridge model above
regressor = Lasso(alpha=0.5)

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression; C is the inverse regularisation strength
classifier = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='liblinear',
)

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# 100 shallow trees (depth 2) with a fixed seed
classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)

model = classifier.fit(X=X_train, y=y_train)

# Per-feature importance scores of the fitted forest
model.feature_importances_

In [None]:
import xgboost as xgb

# Gradient-boosted trees with a binary logistic objective and a fixed seed
classifier = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
)
model = classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

# Unsupervised neighbour search: index the training rows ...
nn = NearestNeighbors(n_neighbors=5)
nn.fit(X=X_train)

# ... then look up the 5 closest training rows for every test row
distances, indices = nn.kneighbors(X=X_test)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay

# Scores for the ROC curve: second column of predict_proba
y_pred = classifier.predict_proba(X_test)[:, 1]

# Curve points first, then the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Build the display from the precomputed values and draw it
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc)
roc_display.plot()


### Alternatively ###

# Build the display straight from the fitted estimator and the test data
RocCurveDisplay.from_estimator(classifier, X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Hard class predictions for the test set
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm, display_labels=classifier.classes_)

# Constructing the display does not draw anything; plot() renders it
cm_display.plot()


### Alternatively ###

# Build and draw in one call; normalize='true' scales each true-label row to sum to 1
ConfusionMatrixDisplay.from_estimator(classifier, X_test, y_test, normalize='true')

In [None]:
from sklearn.metrics import classification_report

# Text summary of per-class metrics; printed so the table layout is preserved
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.inspection import PartialDependenceDisplay

# from_estimator computes the partial dependence and draws the figure itself,
# so calling .plot() afterwards would only draw it a second time.
# The result is named pdp_display so it does not shadow IPython's built-in display().
pdp_display = PartialDependenceDisplay.from_estimator(model, X_test, features=['col1', 'col2'])

In [None]:
from sklearn.inspection import permutation_importance

# Shuffle each feature 10 times on the test set and measure the score change
perm = permutation_importance(
    estimator=model,
    X=X_test,
    y=y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

# One row per feature with its mean importance across the repeats
perm_df = pd.DataFrame(dict(feature=X_test.columns, importance=perm.importances_mean))

# Clustering

In [None]:
from sklearn.cluster import KMeans

# Three clusters with a fixed seed
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X=features)

# Cluster assignment of every fitted row
kmeans.labels_

# Pipelines

In [None]:
from sklearn.compose import ColumnTransformer

# Route columns to different transformers; each entry is (name, transformer, columns).
# Columns not listed are passed through unchanged.
col_trans = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), ['col1', 'col2']),
        ('scale', StandardScaler(), ['col3', 'col4']),
    ],
    remainder='passthrough',
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

# StandardScaler takes no `cols` argument (it raises TypeError). To scale only
# some columns, wrap it in a ColumnTransformer and pass the rest through.
# make_pipeline names the steps automatically from the class names.
model = make_pipeline(
    ColumnTransformer([('scale', StandardScaler(), ['col1', 'col2'])], remainder='passthrough'),
    LogisticRegression()
)

### alternatively ###
# Pipeline takes explicit (name, step) pairs; the names are what grid-search
# parameter keys such as 'classifier__C' refer to.
model = Pipeline([
    ('scaler', ColumnTransformer([('scale', StandardScaler(), ['col1', 'col2'])], remainder='passthrough')),
    ('classifier', LogisticRegression())
])

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

# The grid key 'classifier__n_neighbors' needs a pipeline step named
# 'classifier' that has an n_neighbors parameter, so define that pipeline here.
model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

# Cross-validation splitter, defined in this cell so it exists on a top-to-bottom run
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# '<step name>__<parameter>' -> candidate values
grid = {'classifier__n_neighbors': [1, 5, 10]}

clf = GridSearchCV(model_pipeline, grid, cv=kf, scoring='accuracy', n_jobs=-1)

# Fit every candidate on every fold
model = clf.fit(features, target)

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Chain the scaler and the classifier so scaling is refit inside every fold
pipeline = make_pipeline(scaler, classifier)

# 5 shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=0)

cv_results = cross_val_score(
    estimator=pipeline,  # Pipeline
    X=features,          # Feature matrix
    y=target,            # Target vector
    cv=kf,               # Cross-validation splitter
    scoring="accuracy",  # Performance metric
    n_jobs=-1,           # Use all CPU cores
)

# Average score across the folds
cv_results.mean()