# Useful pandas code snippets

In [None]:
# Ignore every FutureWarning raised from here on
import warnings

warnings.simplefilter("ignore", FutureWarning)

# Data exploration


In [None]:
# Box plot

#--Checking Outliers
# Draw one box plot per non-object column of df to spot outliers visually.

# Collect the columns first so the subplot grid can be sized to fit them:
# a fixed 3x4 grid raises ValueError as soon as a 13th column is plotted.
plot_cols = [i for i in df.columns if df[i].dtype != 'object']
n_grid_cols = 4
# Ceiling division for the row count; never fewer than 3 rows so frames with
# up to 12 plotted columns keep the usual 3x4 layout.
n_grid_rows = max(3, -(-len(plot_cols) // n_grid_cols))

plt.figure(figsize=(20, 20))
for pos, i in enumerate(plot_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, pos)
    ax = sns.boxplot(df[i])
    ax.set_title(i)
plt.show()

### Histograms for categorical features

In [None]:
### Histogram of one column -- change 'class' to the column you need
class_column = df['class']
plt.hist(class_column)
# Alternative: a bar chart built from the value counts
#df["class"].value_counts().plot(title="class", kind='bar')
plt.show()

# Feature selection

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Remove features with low variance; the cut-off is p * (1 - p) with p = 0.8
p = .8
variance_cutoff = p * (1 - p)
sel = VarianceThreshold(threshold=variance_cutoff)
# Ask the transformer to return a DataFrame instead of a NumPy array
sel = sel.set_output(transform='pandas')
df_variance = sel.fit_transform(df)
df_variance.head()

In [None]:
# Keep the 2 features that score highest with mutual_info_regression
from sklearn.feature_selection import SelectKBest, mutual_info_regression

n_best = 2
selector = SelectKBest(score_func=mutual_info_regression, k=n_best)
# Ask the selector to return a DataFrame instead of a NumPy array
selector = selector.set_output(transform='pandas')
df_kbest = selector.fit_transform(X, y)
df_kbest.head()

In [None]:
# Remove highly correlated features

# Absolute correlation matrix: a strong negative correlation makes a feature
# just as redundant as a strong positive one, so compare magnitudes.
corr_matrx = df.corr().abs()

corr_threshold = 0.95
# Keep only the strict upper triangle (k=1 skips the diagonal) so every pair
# is examined once and no column is compared with itself.
upper = corr_matrx.where(
    np.triu(np.ones(corr_matrx.shape), k=1).astype(bool))
# A column is flagged when it correlates above the threshold with any column
# that precedes it in the matrix.
to_drop = [column for column in upper.columns
           if (upper[column] > corr_threshold).any()]
print(to_drop)

## Outliers

In [None]:
def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value is not an outlier.

    A value is an outlier when it lies outside the fences
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` computed on that column. Values
    exactly on a fence are kept, so a column with zero IQR (for example a
    constant column) keeps its rows instead of being emptied. Rows whose
    ``col_name`` value is missing are dropped because they cannot be compared
    with the fences.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Data to filter; it is not modified.
    col_name : label
        Column of ``df_in`` used to compute the quartiles and the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of rows of ``df_in`` that fall within the fences.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # Interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # between() is inclusive on both ends by default
    return df_in.loc[values.between(fence_low, fence_high)]


## Classification

In [None]:
# Some useful import
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, cohen_kappa_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV


In [None]:
# Grid search with cross validation over decision-tree hyper-parameters
grid = dict(
    max_depth=[4, 5, 6, 7, 8, 10, 14],
    criterion=['gini', 'entropy'],
)

# fit() returns the fitted search object, so build and fit in one expression
cv = GridSearchCV(DecisionTreeClassifier(), grid).fit(x_train, y_train)


In [None]:
# Grid search for an SVM classifier
# N.B. very slow
grid_svc = [
    # RBF kernel: tune both gamma and C
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    # Linear kernel: only C is tuned
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# fit() returns the fitted search object, so build and fit in one expression
cv = GridSearchCV(SVC(), grid_svc).fit(x_train, y_train)


In [None]:
# Grid search for a k-nearest-neighbours classifier
# KNeighborsClassifier is not part of the shared import cell, so import it
# here; without it this cell fails with NameError.
from sklearn.neighbors import KNeighborsClassifier

# Try every neighbourhood size from 1 to 10
grid_knn = [{'n_neighbors': list(range(1, 11))}]


cv = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=grid_knn)
cv = cv.fit(x_train, y_train)

In [None]:
# Confusion matrix drawn as a heatmap with the cell values printed on it
# The same label order is used for the matrix and for both tick axes
lang_labels = df.language.unique()
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=lang_labels),
    annot=True,
    fmt='g',
    xticklabels=lang_labels,
    yticklabels=lang_labels,
)


# Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit KMeans for each candidate number of clusters and record the inertia
# and the silhouette score of every fit, then plot both against k.
n_cluster_range = range(2,10)
distortions = []
silhouette_scores = []

for n_clust in n_cluster_range:

    # n_init is pinned: scikit-learn changed its default from 10 to 'auto'
    # in 1.4, so leaving it implicit makes the scan depend on the installed
    # version and differ from the final model cell, which uses n_init=10.
    km = KMeans(n_clusters=n_clust, n_init=10)

    y_km = km.fit_predict(X)
    distortion = km.inertia_
    silhouette = silhouette_score(X, y_km)

    distortions.append(distortion)
    silhouette_scores.append(silhouette)

fig, ax1 = plt.subplots()

# Left y-axis: inertia
color = 'tab:red'
ax1.set_xlabel('Number of clusters')
ax1.set_ylabel('Inertia', color=color)
ax1.plot(n_cluster_range, distortions, color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Right y-axis sharing the same x-axis: silhouette scores
ax2 = ax1.twinx()

color = 'tab:blue'
ax2.set_ylabel('Silhouette scores', color=color)
ax2.plot(n_cluster_range, silhouette_scores, color=color)
ax2.tick_params(axis='y', labelcolor=color)
# NOTE: silhouette scores can be negative; such values fall outside this range
ax2.set_ylim(0,1)

fig.tight_layout()
plt.show()

In [None]:
# Fit the final KMeans model with the chosen number of clusters.
# TODO: set good_k to the k chosen before via the elbow method
good_k = None
if good_k is None:
    # Fail with an explicit message instead of an obscure error further down
    raise ValueError("Set good_k to the number of clusters chosen via the elbow method")

km = KMeans(n_clusters=good_k,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            )
y_km = km.fit_predict(X)

# Silhouette score of the best parameter choice
print(silhouette_score(X, y_km))

## Pairplot after clustering

In [None]:
# Add the cluster label of every sample as a "Class" column ...
df_test = X.assign(Class=y_km)

# ... and colour the pairplot by it (the trailing ';' hides the cell output)
sns.pairplot(df_test, hue="Class");

# Association rules

In [None]:
# Imports
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Create a dataset from a txt file where each line is a single transaction
# and the items of a transaction are separated by ";".
transactions = []
data = []

with open("./online_retail_red.txt") as f:
    for line in f:
        # Strip each token once and use the cleaned name everywhere; keying
        # the row on the raw token would make "item" and "item\n" two
        # different columns.
        transaction = [item.strip() for item in line.split(";")]
        transaction = [item for item in transaction if item]  # drop empty tokens
        if not transaction:
            continue  # blank line: not a transaction
        transactions.append(transaction)
        # One entry per distinct item present in this transaction
        data.append(dict.fromkeys(transaction, 1))

# One row per transaction, one column per item. Items absent from a
# transaction come out of the constructor as NaN, which apriori does not
# accept, so they are turned into False.
df = pd.DataFrame(data).fillna(0).astype(bool)

df.head(10)

## Discovery of frequent itemsets

In [None]:
# Requirements
min_itemsets = 8  # the support search stops once this many qualifying itemsets are found
min_item_in_itemset = 2  # an itemset qualifies only if it holds at least this many items

# "Reasonable" range
# Candidate min_support values, starting at 0.1 and decreasing by 0.01.
# NOTE(review): with a float step, rounding decides whether the value next to
# the stop (0.01) is included -- check the array if the exact end matters.
support_range = np.arange(0.1, 0.01, -0.01)

In [None]:
# Lower the support threshold step by step until apriori yields at least
# `min_itemsets` itemsets that contain `min_item_in_itemset` or more items.
min_support = 0  # 0 doubles as the "nothing found" marker

# Defaults, in case support_range is empty and the loop body never runs
frequent_itemsets = None
itemsets_above_threshold = 0

for s_value in support_range:

    print(f"Trying support value {s_value:.2f}")

    frequent_itemsets = apriori(df, min_support=s_value, use_colnames=True)

    # Calculate the number of itemsets that contain at least `min_item_in_itemset` items
    itemsets_above_threshold = sum(
        len(itemset) >= min_item_in_itemset
        for itemset in frequent_itemsets.itemsets)

    if itemsets_above_threshold >= min_itemsets:
        min_support = s_value
        break

if min_support == 0:
    print("No itemset found! Try again with a bigger range!")
else:
    # The count uses >=, so the message says "at least" rather than "more than"
    print(f"I've selected min_support = {min_support:.2f}, which produced {len(frequent_itemsets)} itemsets, {itemsets_above_threshold} of which had at least {min_item_in_itemset} items")

## Discovery of rules

In [None]:
# Threshold
min_rules = 10  # the confidence search stops once this many rules are produced

# "Reasonable" range
# Candidate confidence thresholds, starting at 1 and decreasing by 0.01.
# NOTE(review): with a float step, rounding decides whether the value next to
# the stop (0.01) is included -- check the array if the exact end matters.
confidence_range = np.arange(1, 0.01, -0.01)


In [None]:
# Lower the confidence threshold step by step until association_rules
# produces at least `min_rules` rules from the frequent itemsets.
min_confidence = 0  # 0 doubles as the "nothing found" marker
rules = None  # defined up front in case confidence_range is empty

for c_value in confidence_range:

    rules = association_rules(
        frequent_itemsets, metric="confidence", min_threshold=c_value)

    if len(rules) >= min_rules:
        min_confidence = c_value
        break

if min_confidence == 0:
    # Reporting threshold 0 next to the rule count of the last value tried
    # would be misleading, so say explicitly that the search failed.
    print(f"No confidence value produced at least {min_rules} rules! Try again with a bigger range!")
else:
    print(
        f'Metric: "confidence" - min_metric: {min_confidence:.4f} - Number of rules: {len(rules)}')


In [None]:
# Rank the rules by confidence, then by support, both descending
sorted_rules = rules.sort_values(by=['confidence', 'support'], ascending=False)
# Renumber the rows 0..n-1 so a row label equals the rule's rank
sorted_rules = sorted_rules.reset_index(drop=True)
sorted_rules


In [None]:
# Scatter plot of the sorted rules: confidence on x, support on y
fig = sorted_rules.plot.scatter(x='confidence', y='support', title='Association Rules')

# Label every point with the position of its rule in sorted_rules
points = zip(sorted_rules['confidence'], sorted_rules['support'])
for i, point in enumerate(points):
    fig.annotate(text=i, xy=point)

# Random

In [None]:
# Apply a string function to a whole column (lab5): strip the whitespace
# around every Description value and write the result back
stripped_description = df0.loc[:, "Description"].str.strip()
df0.loc[:, "Description"] = stripped_description

In [None]:
# Rows that have a null value in at least one column
df0[df0.isna().any(axis="columns")]
# Rows that have a null value in one specific column (here InvoiceNo)
df0[df0["InvoiceNo"].isna()]

In [None]:
# Count the number of rows that satisfy a condition.
# na=False makes missing or non-string InvoiceNo values count as "no match";
# without it they produce NaN in the mask and boolean indexing raises.
is_credit = df1["InvoiceNo"].str.contains("C", na=False)
# Summing a boolean mask counts its True entries
print("Number of credit transcation, ", is_credit.sum())