In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_regression
import warnings
warnings.filterwarnings('ignore')

# Synthetic datasets: a 20-feature classification problem and a 10-feature
# regression problem, both seeded so the numbers below are repeatable.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    random_state=42,
)
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=10,
    noise=0.1,
    random_state=42,
)

# Hold out 20% of each dataset for evaluation (same seed for both splits).
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, **split_options)

# Standardise the classification features. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("=== REGRESSION MODELS ===")

# Build every regressor up front so each one stays reachable by name afterwards.
lin_reg = LinearRegression()
ridge = Ridge(alpha=1.0, random_state=42)  # Most important: alpha
lasso = Lasso(alpha=0.1, random_state=42, max_iter=1000)  # Most important: alpha
knn_reg = KNeighborsRegressor(n_neighbors=5, weights='uniform', metric='minkowski')
tree_reg = DecisionTreeRegressor(max_depth=5, min_samples_split=5, random_state=42)

# Fit each model on the regression training split, then report its .score()
# on the held-out split, in the order listed.
for reg_label, reg_model in (
    ("Linear Regression", lin_reg),
    ("Ridge Regression", ridge),
    ("Lasso Regression", lasso),
    ("KNN Regression", knn_reg),
    ("Decision Tree Regression", tree_reg),
):
    reg_model.fit(X_train_reg, y_train_reg)
    print(f"{reg_label} R²: {reg_model.score(X_test_reg, y_test_reg):.4f}")

print("\n=== CLASSIFICATION MODELS ===")

# Build every classifier up front so each one stays reachable by name afterwards.
log_reg = LogisticRegression(C=1.0, penalty='l2', random_state=42, max_iter=1000)
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='minkowski')
svm_clf = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42, probability=True)
nb_clf = GaussianNB(var_smoothing=1e-9)
lda = LinearDiscriminantAnalysis(solver='svd')
tree_clf = DecisionTreeClassifier(max_depth=5, min_samples_split=5, random_state=42)
ada = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# (label, model, training features, test features). The first five models are
# trained on the standardised features; the tree-based models use the raw ones.
for clf_label, clf_model, fit_X, eval_X in (
    ("Logistic Regression", log_reg, X_train_scaled, X_test_scaled),
    ("KNN Classification", knn_clf, X_train_scaled, X_test_scaled),
    ("SVM Classification", svm_clf, X_train_scaled, X_test_scaled),
    ("Naive Bayes", nb_clf, X_train_scaled, X_test_scaled),
    ("LDA", lda, X_train_scaled, X_test_scaled),
    ("Decision Tree", tree_clf, X_train, X_test),
    ("AdaBoost", ada, X_train, X_test),
    ("Gradient Boosting", gb, X_train, X_test),
):
    clf_model.fit(fit_X, y_train)
    print(f"{clf_label} Accuracy: {clf_model.score(eval_X, y_test):.4f}")

# Stacking Ensemble: three base learners whose outputs feed a logistic-regression
# final estimator.
base_learners = [
    ('log_reg', LogisticRegression(C=1.0, random_state=42)),
    ('svm', SVC(C=1.0, kernel='linear', probability=True, random_state=42)),
    ('tree', DecisionTreeClassifier(max_depth=3, random_state=42)),
]

stack_options = dict(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    cv=5,
    passthrough=False,  # Set to True for passthrough stacking
)
stack_clf = StackingClassifier(**stack_options)
stack_clf.fit(X_train_scaled, y_train)

stack_accuracy = stack_clf.score(X_test_scaled, y_test)
print(f"Stacking Ensemble Accuracy: {stack_accuracy:.4f}")

In [None]:
#regression outlier detection
def detect_outliers_iqr(data, factor=1.5):
    """
    Flag values that fall outside the IQR fences.

    Parameters
    ----------
    data : array-like
        Numeric values to test. Plain lists and tuples are converted to a
        NumPy array so the element-wise comparisons below work; any other
        input (e.g. an ndarray) is used as given.
    factor : float, default 1.5
        Multiple of the inter-quartile range placed beyond Q1 and Q3 to
        position the lower and upper fences.

    Returns
    -------
    Boolean mask with one entry per element of ``data``; True marks a value
    below ``Q1 - factor * IQR`` or above ``Q3 + factor * IQR``.
    """
    # A raw Python sequence cannot be compared element-wise with a scalar.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)

    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    return (data < lower_bound) | (data > upper_bound)

# Apply the IQR rule to the regression target. y_reg is used rather than y
# because y holds the class labels produced by make_classification.
target_outliers_iqr = detect_outliers_iqr(y_reg)
print(f"IQR outliers found: {np.sum(target_outliers_iqr)}")

def remove_regression_outliers(X, y, method='cooks', threshold=3):
    """
    Remove outliers from regression data.

    Parameters
    ----------
    X : array-like
        Feature matrix; must support boolean-mask row indexing (``X[mask]``)
        and expose ``.shape``.
    y : array-like
        Target values aligned with the rows of ``X``.
    method : {'zscore', 'iqr', 'cooks', 'residual'}, default 'cooks'
        'zscore'   - ``detect_outliers_zscore(y, threshold)``
        'iqr'      - ``detect_outliers_iqr(y)``
        'residual' - ``detect_outliers_zscore`` applied to the residuals of a
                     ``LinearRegression`` fitted on ``(X, y)``
        'cooks'    - Cook's distance from an OLS fit greater than ``4 / len(X)``
    threshold : float, default 3
        Forwarded to ``detect_outliers_zscore``; only the 'zscore' and
        'residual' methods use it.

    Returns
    -------
    (X_clean, y_clean, outliers) where ``outliers`` is the boolean mask of
    removed rows.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.

    NOTE(review): ``detect_outliers_zscore``, ``sm`` and ``OLSInfluence`` are
    not defined in this block; ``sm`` is presumably ``statsmodels.api`` —
    confirm they are in scope before using those methods.
    """
    valid_methods = ('zscore', 'iqr', 'cooks', 'residual')
    # Reject unknown methods up front instead of failing later on an
    # unassigned ``outliers`` variable.
    if method not in valid_methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {valid_methods}"
        )

    if method == 'zscore':
        outliers = detect_outliers_zscore(y, threshold)
    elif method == 'iqr':
        outliers = detect_outliers_iqr(y)
    elif method == 'residual':
        model = LinearRegression()
        model.fit(X, y)
        residuals = y - model.predict(X)
        outliers = detect_outliers_zscore(residuals, threshold)
    else:  # method == 'cooks'
        X_with_const = sm.add_constant(X)
        model_sm = sm.OLS(y, X_with_const).fit()
        influence = OLSInfluence(model_sm)
        cooks_d = influence.cooks_distance[0]
        outliers = cooks_d > 4 / len(X)

    # Keep only the rows that were not flagged.
    X_clean = X[~outliers]
    y_clean = y[~outliers]

    print(f"Removed {np.sum(outliers)} outliers using {method} method")
    print(f"Original shape: {X.shape}, Cleaned shape: {X_clean.shape}")

    return X_clean, y_clean, outliers

# Usage example: clean the regression dataset. X_reg / y_reg are used because
# X / y hold the classification features and class labels.
X_clean, y_clean, outliers = remove_regression_outliers(X_reg, y_reg, method='iqr')

In [None]:
#outlier analysis for classification problem
from sklearn.datasets import make_classification

# Generate a two-feature classification dataset.
X_clf, y_clf = make_classification(
    n_samples=300,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=42,
)
# Overwrite two rows with extreme points to act as outliers. Their class is
# whatever label was generated at those indices — it is not set here.
X_clf[50] = [4, 4]
X_clf[150] = [-4, -4]
y_clf = y_clf.astype(int)

plt.figure(figsize=(15, 5))

# One panel per feature, one box per class label.
class_ids = np.unique(y_clf)
class_names = [f'Class {j}' for j in class_ids]
for i in range(X_clf.shape[1]):
    plt.subplot(1, 2, i+1)
    data_by_class = [X_clf[y_clf == j, i] for j in class_ids]
    plt.boxplot(data_by_class, labels=class_names)
    plt.title(f'Feature {i+1} - Box Plot by Class')
    plt.ylabel('Feature Value')
plt.tight_layout()
plt.show()

################
def remove_classification_outliers(X, y, method='isolation_forest', **kwargs):
    """
    Remove outliers from classification data.

    Parameters
    ----------
    X : array-like
        Feature matrix; must support boolean-mask row indexing (``X[mask]``)
        and expose ``.shape``.
    y : array-like
        Class labels aligned with the rows of ``X``.
    method : {'isolation_forest', 'lof', 'dbscan', 'classwise_iqr'}
        Which detector to delegate to.
    **kwargs
        ``contamination`` (default 0.05) for 'isolation_forest' and 'lof';
        ``eps`` (default 0.5) and ``min_samples`` (default 5) for 'dbscan'.

    Returns
    -------
    (X_clean, y_clean, outliers) where ``outliers`` is the boolean mask of
    removed rows.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.

    NOTE(review): the ``detect_*`` helpers called below are not defined in
    this block — confirm they are in scope.
    """
    valid_methods = ('isolation_forest', 'lof', 'dbscan', 'classwise_iqr')
    # Reject unknown methods up front instead of failing later on an
    # unassigned ``outliers`` variable.
    if method not in valid_methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {valid_methods}"
        )

    if method == 'isolation_forest':
        contamination = kwargs.get('contamination', 0.05)
        outliers = detect_outliers_isolation_forest(X, contamination)
    elif method == 'lof':
        contamination = kwargs.get('contamination', 0.05)
        outliers = detect_outliers_lof(X, contamination)
    elif method == 'dbscan':
        eps = kwargs.get('eps', 0.5)
        min_samples = kwargs.get('min_samples', 5)
        outliers = detect_outliers_dbscan(X, eps, min_samples)
    else:  # method == 'classwise_iqr'
        outliers = detect_classification_outliers(X, y, method='iqr')

    # Keep only the rows that were not flagged.
    X_clean = X[~outliers]
    y_clean = y[~outliers]

    print(f"Removed {np.sum(outliers)} outliers using {method} method")
    print(f"Original shape: {X.shape}, Cleaned shape: {X_clean.shape}")

    # Report how many samples of each class survive the cleaning.
    unique, counts = np.unique(y_clean, return_counts=True)
    print(f"Class distribution after cleaning: {dict(zip(unique, counts))}")

    return X_clean, y_clean, outliers

# Usage example: drop the rows flagged by the isolation-forest detector,
# with contamination set to 0.05.
X_clf_clean, y_clf_clean, clf_outliers = remove_classification_outliers(
    X=X_clf,
    y=y_clf,
    method='isolation_forest',
    contamination=0.05,
)

In [None]:
# Stratified 5-fold cross-validation (shuffled, seeded).
# NOTE(review): lr is not defined in this cell — it must already exist in the kernel.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=25)
results = cross_val_score(lr, X, y, cv=kfold)  # no scoring argument passed
print(results.mean())


# Grid search CV: six candidate solvers crossed with 20 evenly spaced C values,
# scored by ROC AUC on the same folds.
params = dict(
    solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    C=np.linspace(0.001, 4, 20),
)
# The search object is only constructed here; nothing in this cell fits it.
gcv = GridSearchCV(lr, param_grid=params, cv=kfold, scoring='roc_auc')

In [None]:
# Bagging: fit a BaggingClassifier for every (base estimator, ensemble size)
# pair and record the test-set log loss of each combination.
# NOTE(review): nb, dtc, knn, lr, X_train_trns and X_test_trns are not defined
# in this cell — they must already exist in the kernel.
est_list = [nb, dtc, knn, lr]
n_est = [10, 15, 25, 50]
scores = []
for e in tqdm(est_list):
    for n in n_est:
        bagg = BaggingClassifier(random_state=25, n_estimators=n, estimator=e )
        bagg.fit(X_train_trns, y_train)
        y_pred_prob = bagg.predict_proba(X_test_trns)
        # Each row stores the estimator object itself, the ensemble size and the log loss.
        scores.append([e, n, log_loss(y_test, y_pred_prob)])
df_scores = pd.DataFrame( scores, columns=['Estimator','B-Samples','score'] )
# sort_values returns a new frame and the result is not assigned here, so
# df_scores itself keeps its original row order.
df_scores.sort_values('score')

# Gradient boosting template.
# NOTE(review): n, d and r are not assigned in these lines. If this runs after
# the bagging loop above, n holds that loop's last value; d and r must be
# defined elsewhere — confirm before running.
gbm = GradientBoostingClassifier(random_state=25, n_estimators=n,
                                 max_depth=d, learning_rate=r)

# Random forest template. The original line carried a stray leading space,
# which is an IndentationError at top level.
# NOTE(review): f is not assigned in these lines — confirm where it comes from.
rf = RandomForestClassifier(random_state=25, max_features=f)

In [None]:
# Voting ensemble: this builds a VotingClassifier (not a stacking model) from
# three named estimators.
# NOTE(review): dtc, knn and nb are not defined in this cell — presumably a
# decision tree, a KNN and a naive Bayes estimator; verify.
voting = VotingClassifier(estimators=[('TREE',dtc),('KNN',knn),('NB',nb)])

In [None]:
# Clustering: fit KMeans for k = 2..10 and record the inertia of each fit,
# then plot it against k.
# NOTE(review): milk_scaled is not defined in this cell — it must already exist.
wss = []
for i in range(2, 11):
    clust = KMeans(n_clusters=i, random_state=25).fit(milk_scaled)
    wss.append([i, clust.inertia_])
df_wss = pd.DataFrame(wss, columns=['clusters', 'wss'])

# Markers and a connecting line drawn from the same two columns.
k_values, wss_values = df_wss['clusters'], df_wss['wss']
plt.scatter(k_values, wss_values)
plt.plot(k_values, wss_values)
plt.xlabel("Clusters")
plt.ylabel("WSS")
plt.title("Scree Plot")



# DBSCAN grid: for every (eps, min_samples) pair, score the clustering by the
# silhouette of the points that were not labelled -1.
epsilons = [0.2, 0.4, 0.6, 0.8, 1, 1.2]
min_pcts = [2, 3, 4, 5]
scores = []
for e in epsilons:
    for m in min_pcts:
        clust = DBSCAN(eps=e, min_samples=m).fit(milk_scaled)
        # Attach the labels as a trailing column and drop the rows labelled -1.
        labelled = milk_scaled.assign(label=clust.labels_)
        inliers = labelled[labelled['label'] != -1]
        # Skip parameter pairs that leave fewer than two clusters among the
        # remaining rows.
        if inliers['label'].nunique() < 2:
            continue
        scores.append([e, m, silhouette_score(inliers.iloc[:, :-1], inliers['label'])])
df_scores = pd.DataFrame(scores, columns=['eps', 'min', 'score'])
df_scores.sort_values('score', ascending=False)

In [None]:
# PCA biplot with every observation annotated by its index label.
# NOTE(review): pc_data is not defined in this cell — presumably the frame of
# principal-component scores; verify before running.
row_names = list(milk.index)
model = pca()
results = model.fit_transform(
    milk_scaled,
    col_labels=milk.columns,
    row_labels=row_names,
)
model.biplot(label=True, legend=True)
# Place each row's name at its first two columns of pc_data.
for i, row_name in enumerate(row_names):
    plt.text(pc_data.values[i, 0], pc_data.values[i, 1], row_name)
plt.show()

### Time Series

In [None]:
# Seasonal decomposition of the 'Milk' column with a multiplicative model and
# a period of 12.
# NOTE(review): df is not loaded in this cell — it must already exist and
# contain a 'Milk' column.
series = df['Milk']
result = seasonal_decompose(series, model='multiplicative',period=12)
result.plot()
plt.show()

# Centered 3-point moving average over the same column.
# Note: this rebinds the name y, which earlier held the classification target.
y = df['Milk']
fcast = y.rolling(3,center=True).mean()
plt.plot(y, label='Original Data')
plt.plot(fcast, label='Centered Moving Average')
plt.legend(loc='best')
plt.show()

# Trailing moving-average forecast: hold out the last 12 observations and
# forecast them with the final trailing mean of the training part.
# Note: this rebinds y_train / y_test from the earlier train_test_split.
y_train, y_test = df['Milk'].iloc[:-12], df['Milk'].iloc[-12:]
span = 5
fcast = y_train.rolling(window=span).mean()
MA = fcast.iloc[-1]
# Flat forecast: the last trailing mean repeated once per held-out point.
MA_series = pd.Series(np.full(len(y_test), MA))
MA_fcast = pd.concat([fcast, MA_series], ignore_index=True)
rmse = root_mean_squared_error(y_test, MA_series)

for curve, curve_label in (
    (y_train, 'Train'),
    (y_test, 'Test'),
    (MA_fcast, 'Rolling Average Forecast'),
):
    plt.plot(curve, label=curve_label)
plt.title(f"RMSE = {rmse:.2f}")
plt.legend(loc='best')
plt.show()


# Simple exponential smoothing with a fixed smoothing level (alpha), fitted on
# y_train and forecast over the length of y_test.
alpha = 0.2
ses = SimpleExpSmoothing(y_train)
fit1 = ses.fit(smoothing_level=alpha)
fcast1 = fit1.forecast(len(y_test))
# Overlay the held-out values and the forecast; the RMSE goes in the title.
y_test.plot(color="pink", label='Test')
fcast1.plot(color="purple", label='Forecast')
rmse = root_mean_squared_error(y_test, fcast1)
plt.title(f"RMSE = {rmse:.2f}")
plt.legend(loc='best')
plt.show()

In [None]:
# Holt's linear method with fixed level (alpha) and trend (beta) smoothing
# parameters, fitted on y_train and forecast over the length of y_test.
alpha = 0.8
beta = 0.02
holt = Holt(y_train)
fit1 = holt.fit(smoothing_level=alpha, smoothing_trend=beta)
fcast1 = fit1.forecast(len(y_test))
# Overlay the held-out values and the forecast; the RMSE goes in the title.
y_test.plot(color="pink", label='Test')
fcast1.plot(color="purple", label='Forecast')
rmse = root_mean_squared_error(y_test, fcast1)
plt.title(f"RMSE = {rmse:.2f}")
plt.legend(loc='best')
plt.show()

# Holt's method with exponential=True, explored interactively.
holt = Holt(y_train, exponential=True)
def holt_exp(alpha, beta):
    """
    Fit the module-level ``holt`` model with the given smoothing parameters
    and plot its forecast against ``y_test``.

    alpha -- smoothing level passed to ``fit``
    beta  -- smoothing trend passed to ``fit``

    Shows a plot whose title carries the RMSE, alpha and beta; returns None.
    """
    fitted = holt.fit(smoothing_level=alpha, smoothing_trend=beta)
    horizon = len(y_test)
    forecast = fitted.forecast(horizon)

    y_test.plot(color="pink", label='Test')
    forecast.plot(color="purple", label='Forecast')

    error = root_mean_squared_error(y_test, forecast)
    plt.title(f"RMSE = {error:.2f}, alpha = {alpha:.2f}, beta = {beta:.2f}")
    plt.legend(loc='best')
    plt.show()
widgets.interact(holt_exp, alpha=(0.01, 1, 0.01), beta=(0.01, 1, 0.01))


# Damped-trend Holt, explored interactively.
def damped(alpha, beta, phi, exponentiality, dampness):
    """
    Build a Holt model on ``y_train``, fit it and plot its forecast against
    ``y_test``.

    alpha          -- smoothing level passed to ``fit``
    beta           -- smoothing trend passed to ``fit``
    phi            -- damping trend passed to ``fit``
    exponentiality -- forwarded as ``exponential`` to ``Holt``
    dampness       -- forwarded as ``damped_trend`` to ``Holt``

    Shows a plot whose title carries the RMSE, alpha and beta; returns None.
    """
    fitted = Holt(
        y_train,
        exponential=exponentiality,
        damped_trend=dampness,
    ).fit(smoothing_level=alpha, smoothing_trend=beta, damping_trend=phi)
    forecast = fitted.forecast(len(y_test))

    y_test.plot(color="pink", label='Test')
    forecast.plot(color="purple", label='Forecast')

    error = root_mean_squared_error(y_test, forecast)
    plt.title(f"RMSE = {error:.2f}, alpha = {alpha:.2f}, beta = {beta:.2f}")
    plt.legend(loc='best')
    plt.show()
widgets.interact(
    damped,
    alpha=(0.01, 1, 0.01),
    beta=(0.01, 1, 0.01),
    phi=(0.01, 1, 0.01),
    exponentiality=[True, False],
    dampness=[True, False],
)


# Holt-Winters (additive trend, selectable seasonality), explored interactively.
def hw(alpha, beta, gamma, seasonality, periods=12):
    """
    Build an ExponentialSmoothing model on ``y_train`` with an additive trend,
    fit it and plot its forecast against ``y_test``.

    alpha       -- smoothing level passed to ``fit``
    beta        -- smoothing trend passed to ``fit``
    gamma       -- smoothing seasonal passed to ``fit``
    seasonality -- forwarded as ``seasonal`` to the model
    periods     -- forwarded as ``seasonal_periods`` (default 12)

    Shows a plot whose title carries the RMSE and the three smoothing
    parameters; returns None.
    """
    fitted = ExponentialSmoothing(
        y_train,
        trend='add',
        seasonal=seasonality,
        seasonal_periods=periods,
    ).fit(smoothing_level=alpha, smoothing_trend=beta, smoothing_seasonal=gamma)
    forecast = fitted.forecast(len(y_test))

    y_test.plot(color="pink", label='Test')
    forecast.plot(color="purple", label='Forecast')

    error = root_mean_squared_error(y_test, forecast)
    plt.title(f"RMSE={error:.2f}, alpha={alpha:.2f}, beta={beta:.2f}, gamma={gamma:.2f}")
    plt.legend(loc='best')
    plt.show()
widgets.interact(
    hw,
    alpha=(0.01, 1, 0.01),
    beta=(0.01, 1, 0.01),
    gamma=(0.01, 1, 0.01),
    seasonality=['add', 'mul'],
)

In [None]:
# Augmented Dickey-Fuller test on the first-order differenced 'Value' series.
# NOTE(review): df must already contain a 'Value' column when this runs; the
# read_csv of FRED-NROUST.csv appears in a later cell of this file.
ord_1_diff = df['Value'].diff().dropna()  # first-order differencing, NaN rows removed
result = adfuller(ord_1_diff, maxlag=10)
p_value = result[1]
print("P-Value =", p_value)
verdict = "Time Series is Stationary" if p_value < 0.05 else "Time Series is not Stationary"
print(verdict)

In [None]:
# Load the series and hold out the last 8 observations for evaluation.
# Note: this rebinds df, y_train and y_test used by earlier cells.
df = pd.read_csv("FRED-NROUST.csv")
y_train = df['Value'].iloc[:-8]
y_test = df['Value'].iloc[-8:]
# This tuple expression is not the last statement of the cell and is not
# assigned, so its value is discarded.
y_train.shape, y_test.shape

# ARIMA(1,1,1) fitted on the training part.
model = ARIMA(y_train,order=(1,1,1))
model_fit = model.fit()
#print('Coefficients: %s' % model_fit.params)
# Predict the positions immediately after the training data, one per held-out point.
predictions = model_fit.predict(start=len(y_train), end=len(y_train)+len(y_test)-1)
y_test.plot(color="pink", label='Test')
predictions.plot(color="purple", label='Forecast')
rmse = root_mean_squared_error(y_test, predictions)
plt.title(f"RMSE={rmse:.5f}")
plt.legend(loc='best')
plt.show()

# Seasonal ARIMA, explored interactively.
def sarima(p,d,q, P, D, Q, S):
    """
    Fit an ARIMA model with seasonal terms on ``y_train`` and plot its
    forecast against ``y_test``.

    p, d, q -- non-seasonal order passed as ``order=(p, d, q)``
    P, D, Q -- seasonal order terms
    S       -- seasonal period; passed as the last element of ``seasonal_order``

    Shows a plot whose title carries the RMSE; returns None.
    """
    model = ARIMA(y_train,order=(p,d,q),seasonal_order=(P, D, Q, S))
    model_fit = model.fit()
    # Predict the positions immediately after the training data, one per held-out point.
    predictions = model_fit.predict(start=len(y_train), end=len(y_train)+len(y_test)-1)
    y_test.plot(color="pink", label='Test')
    predictions.plot(color="purple", label='Forecast')
    rmse = root_mean_squared_error(y_test, predictions)
    plt.title(f"RMSE={rmse:.5f}")
    plt.legend(loc='best')
    plt.show()
# A bare integer (S=12) makes interact build a slider spanning -12..36, which
# exposes zero and negative seasonal periods. Use an explicit slider that
# still starts at 12 and tops out at 36 but cannot go below 2.
# NOTE(review): assumes `widgets` is ipywidgets — confirm.
widgets.interact( sarima, p=(0,5,1), d=(0,5,1), q=(0,5,1),
                 P=(0,5,1), D=(0,5,1), Q=(0,5,1),
                 S=widgets.IntSlider(value=12, min=2, max=36, step=1) )

In [None]:
# Association rules: mine frequent itemsets from the Faceplate data and derive
# rules filtered by confidence.
fp_df = pd.read_csv('Faceplate.csv',index_col=0)
# Not the last statement of the cell and not assigned, so this preview is discarded.
fp_df.head()
fp_df = fp_df.astype(bool) # convert to boolean for compatibility of apriori( )
# Itemsets with support of at least 0.2, then rules with confidence of at least 0.6.
itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)
rules = association_rules(itemsets, metric='confidence',  min_threshold=0.6)
# Keep only the columns of interest.
rules = rules[['antecedents','consequents','support', 'confidence','lift']]
# Bare expression followed by more statements in the cell; its value is discarded.
rules
def create_rules(min_supp, conf_thres):
    """
    Mine association rules from the module-level ``fp_df``.

    min_supp   -- minimum support passed to ``apriori``
    conf_thres -- minimum confidence passed to ``association_rules``

    Returns the rules restricted to the antecedents, consequents, support,
    confidence and lift columns.
    """
    frequent_itemsets = apriori(fp_df, min_support=min_supp, use_colnames=True)
    all_rules = association_rules(
        frequent_itemsets,
        metric='confidence',
        min_threshold=conf_thres,
    )
    wanted_columns = ['antecedents','consequents','support', 'confidence','lift']
    return all_rules[wanted_columns]
widgets.interact(
    create_rules,
    min_supp=(0.01, 1, 0.01),
    conf_thres=(0.01, 1, 0.01),
)