In [None]:
pip install spmf

Collecting spmf
  Downloading spmf-1.4-py3-none-any.whl (17 kB)
Installing collected packages: spmf
Successfully installed spmf-1.4


In [None]:
# Importing necessary libraries
import pandas as pd
#from spmf import Spmf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/car.data", header=None)
df.columns = ['buying', 'maintenance', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Preprocessing: Convert categorical variables to numerical
df = pd.get_dummies(df, columns=['buying', 'maintenance', 'doors', 'persons', 'lug_boot', 'safety'])

# Split data into features and target
X = df.drop(columns=['class'])
y = df['class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

# Train and evaluate classifiers
results = {}
for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[clf_name] = accuracy

# Print results
output_file = "classifier_results.txt"
with open(output_file, 'w') as file:
    for clf_name, accuracy in results.items():
        file.write(f"{clf_name}: Accuracy - {accuracy}\n")

# Print success message
print(f"Results saved to {output_file}")

# Now you can proceed with pattern mining using SPMF library and further evaluation and comparison of algorithms.


Results saved to classifier_results.txt


In [None]:
# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/car.data", header=None)
df.columns = ['buying', 'maintenance', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Convert categorical variables to numerical
df = pd.get_dummies(df, columns=['buying', 'maintenance', 'doors', 'persons', 'lug_boot', 'safety'])

# Convert DataFrame to transaction format
transactions = []
for _, row in df.iterrows():
    transaction = []
    for col in df.columns:
        if col != 'class':
            transaction.append(col + "_" + str(row[col]))
    transactions.append(transaction)

# Write transactions to input file
input_file = "input_transactions.txt"
with open(input_file, 'w') as file:
    for transaction in transactions:
        file.write(" ".join(transaction) + "\n")

# Print success message
print(f"Input transactions saved to {input_file}")

# Now you can proceed with pattern mining using SPMF library and further evaluation and comparison of algorithms.


Input transactions saved to input_transactions.txt


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

# Load dataset
data = pd.read_csv('car.data', header=None)
data.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Write dataset to a new file
data.to_csv('car_input.csv', index=False)

# Handle categorical data
le = LabelEncoder()
data_encoded = data.apply(le.fit_transform)

# Split features and target variable
X = data_encoded.drop('class', axis=1)
y = data_encoded['class']

# Handle unbalanced classes
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test)

# Model evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Further analysis can be added as per your requirement.


Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       356
           1       1.00      1.00      1.00       347
           2       1.00      0.99      0.99       385
           3       1.00      1.00      1.00       364

    accuracy                           1.00      1452
   macro avg       1.00      1.00      1.00      1452
weighted avg       1.00      1.00      1.00      1452



In [None]:
import os
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth

# Step 1: Data Preprocessing
# Load dataset
data = pd.read_csv('car.data', header=None)
data.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Convert categorical variables into text format
data = data.astype(str)

# Convert dataframe to transaction format
te = TransactionEncoder()
te_ary = te.fit(data.values).transform(data.values)
transaction_df = pd.DataFrame(te_ary, columns=te.columns_)

# Step 2: Generating Frequent Patterns
# Set minimum support threshold
min_support = 0.3

# Use Apriori algorithm
apriori_patterns = apriori(transaction_df, min_support=min_support, use_colnames=True)

# Use FP-Growth algorithm
fpgrowth_patterns = fpgrowth(transaction_df, min_support=min_support, use_colnames=True)

# Step 3: Comparing Algorithms (Not applicable in this approach)

# Step 4: Evaluation and Comparison (Not applicable in this approach)
# Display the patterns obtained
print("Apriori Patterns:")
print(apriori_patterns)
print("\nFP-Growth Patterns:")
print(fpgrowth_patterns)


Apriori Patterns:
     support           itemsets
0   0.500000                (2)
1   0.500000                (4)
2   0.333333              (big)
3   0.625000             (high)
4   0.625000              (low)
5   0.750000              (med)
6   0.333333             (more)
7   0.333333            (small)
8   0.700231            (unacc)
9   0.437500            (vhigh)
10  0.312500          (2, high)
11  0.312500           (2, low)
12  0.375000           (2, med)
13  0.438657         (unacc, 2)
14  0.312500          (4, high)
15  0.312500           (4, low)
16  0.375000           (4, med)
17  0.306713         (4, unacc)
18  0.333333        (high, low)
19  0.430556        (high, med)
20  0.408565      (unacc, high)
21  0.430556         (med, low)
22  0.456019       (unacc, low)
23  0.502315       (unacc, med)
24  0.354167     (unacc, vhigh)
25  0.324074    (unacc, 2, med)
26  0.304977  (unacc, med, low)

FP-Growth Patterns:
     support           itemsets
0   0.700231            (unacc)
1

  and should_run_async(code)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load dataset
data = pd.read_csv('car.data', header=None)
data.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Feature Engineering: Use the mined patterns or association rules as additional features

# Split features and target variable
X = data.drop('class', axis=1)
y = data['class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Training
# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test)

# Evaluation
print("Classification Report without mined patterns/rules:")
print(classification_report(y_test, y_pred))

# Experiment with adding mined patterns/rules as additional features and repeat the training and evaluation process

# Optimization: No specific optimization demonstrated in this basic example

# Documentation and Presentation: Prepare a detailed report and presentation slides


  and should_run_async(code)


ValueError: could not convert string to float: 'med'

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpgrowth, fpmax
from mlxtend.preprocessing import TransactionEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/car.data', header=None)

# Set column names
data.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Preprocessing
# Encode categorical variables
label_encoder = LabelEncoder()
for column in data.columns:
    data[column] = label_encoder.fit_transform(data[column])

# Split data into features and target
X = data.drop('class', axis=1)
y = data['class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Generate frequent patterns
def generate_patterns(data, algorithm, min_support):
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    if algorithm == 'apriori':
        patterns = apriori(df, min_support=min_support, use_colnames=True)
    elif algorithm == 'fpgrowth':
        patterns = fpgrowth(df, min_support=min_support, use_colnames=True)
    elif algorithm == 'fpmax':
        patterns = fpmax(df, min_support=min_support, use_colnames=True)
    return patterns

# Compare algorithms
def compare_algorithms(data, min_support):
    algorithms = ['apriori', 'fpgrowth', 'fpmax']
    for algorithm in algorithms:
        print(f"Algorithm: {algorithm}")
        patterns = generate_patterns(data, algorithm, min_support)
        print(patterns)

# Perform comparison
min_support = 0.2  # You can adjust this threshold
compare_algorithms(data.values.tolist(), min_support)

# Additional steps: You can further tune parameters and evaluate algorithms using classifiers or clustering techniques.
# For example:
# - Train classifiers using frequent patterns as features
# - Evaluate classifiers' performance
# - Tune parameters such as min_support, pattern length, etc.
# - Experiment with different classifiers or clustering algorithms
# - Use cross-validation for robust evaluation

# Example: Training a Random Forest classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


  and should_run_async(code)


Algorithm: apriori
     support      itemsets
0   0.905671           (0)
1   0.875000           (1)
2   0.979167           (2)
3   0.604167           (3)
4   0.784144        (0, 1)
5   0.884838        (0, 2)
6   0.533565        (0, 3)
7   0.854167        (1, 2)
8   0.515046        (1, 3)
9   0.590278        (2, 3)
10  0.763310     (0, 1, 2)
11  0.447917     (0, 1, 3)
12  0.519676     (0, 2, 3)
13  0.501157     (1, 2, 3)
14  0.434028  (0, 1, 2, 3)
Algorithm: fpgrowth
     support      itemsets
0   0.979167           (2)
1   0.905671           (0)
2   0.875000           (1)
3   0.604167           (3)
4   0.884838        (0, 2)
5   0.854167        (1, 2)
6   0.784144        (0, 1)
7   0.763310     (0, 1, 2)
8   0.590278        (2, 3)
9   0.533565        (0, 3)
10  0.515046        (1, 3)
11  0.519676     (0, 2, 3)
12  0.501157     (1, 2, 3)
13  0.447917     (0, 1, 3)
14  0.434028  (0, 1, 2, 3)
Algorithm: fpmax
    support      itemsets
0  0.434028  (0, 1, 2, 3)
              precision    r