#### "Learning Data Mining with Python - Chapter 3"
This Jupyter notebook contains a Python implementation of affinity analysis and classification.

#### 1-) Affinity Analysis

In [22]:
# Import NumPy and load the example purchase matrix from its text file.
import numpy as np
# A forward slash is used as the path separator because it is accepted on
# Windows, Linux and macOS alike; a backslash only works on Windows.
data = np.loadtxt('Chapter 1/affinity_dataset.txt')

In [23]:
# The text file records which products people bought in a supermarket.
# A 1 marks a purchased product. The products are bread, milk, cheese, apple, banana.
# Display the first five rows as a quick preview.
data[:5]

array([[0., 0., 1., 1., 1.],
       [1., 1., 0., 1., 0.],
       [1., 0., 1., 1., 0.],
       [0., 0., 1., 1., 1.],
       [0., 1., 0., 0., 1.]])

In [24]:
# Human-readable name for each column of `data`, in column order.
features = [
    'bread',
    'milk',
    'cheese',
    'apples',
    'bananas',
]

In [25]:
# Split the array's shape into its row count (samples) and column count (features).
(data_samples, data_features) = np.shape(data)
print(f"Row: {data_samples}, Column: {data_features}")

Row: 100, Column: 5


In [26]:
# Count the rows whose apples column (index 3) is set to 1.
num_apples = sum(1 for purchase in data if purchase[3] == 1)
print(f'Number of People bought Apples: {num_apples}')

Number of People bought Apples: 36


In [27]:
# Count the rows whose bananas column (index 4) is set to 1.
bought_bananas = data[:, 4] == 1
num_bananas = int(np.count_nonzero(bought_bananas))
print("Num of People purchased bananas: {0}".format(num_bananas))

Num of People purchased bananas: 59


In [28]:
# Among the rows where apples (column 3) were bought, count how many also
# contain bananas (column 4) and how many do not.
apple_rows = data[data[:, 3] == 1]
rule_valid = int(np.count_nonzero(apple_rows[:, 4] == 1))
rule_invalid = len(apple_rows) - rule_valid
# The apples-and-bananas count is the same quantity as the valid-rule count.
num_app_ban = rule_valid

print('Num of Valid : {0}, Invalid: {1}'.format(rule_valid, rule_invalid))

Num of Valid : 21, Invalid: 15


In [29]:
# Rule under test: if people buy apples, then they will also buy bananas.
# Support    - number of samples in which the rule holds.
# Confidence - fraction of apple purchases for which the rule holds.
supp = rule_valid
conf = rule_valid / num_apples
summary = "Support: {0}, confidence: {1:.3f}, {2:.1f}%"
print(summary.format(supp, conf, 100*conf))



Support: 21, confidence: 0.583, 58.3%


In [30]:
# Compute support and confidence for every ordered (premise, conclusion) pair.
from collections import defaultdict

valid_rules = defaultdict(int)      # (premise, conclusion) -> samples where both were bought
invalid_rules = defaultdict(int)    # (premise, conclusion) -> samples with premise but not conclusion
num_occurences = defaultdict(int)   # product name -> samples in which it was bought

for sample in data:
    for premise in range(data_features):
        # Only a purchased product can act as the premise of a rule.
        if sample[premise] != 1:
            continue
        premise_name = features[premise]
        num_occurences[premise_name] += 1
        for conclusion in range(data_features):
            # A product is never paired with itself.
            if conclusion == premise:
                continue
            rule = (premise_name, features[conclusion])
            tally = valid_rules if sample[conclusion] == 1 else invalid_rules
            tally[rule] += 1

# Support is the valid-rule tally itself (same object, not a copy).
support = valid_rules
# Confidence = support / number of samples containing the premise.
confidence = defaultdict(float)
for rule, hits in support.items():
    confidence[rule] = hits / num_occurences[rule[0]]
print(f"Num. of purchased items: ")
num_occurences

Num. of purchased items: 


defaultdict(int,
            {'cheese': 41,
             'apples': 36,
             'bananas': 59,
             'bread': 27,
             'milk': 46})

In [31]:
# Report every rule together with its confidence (as a percentage) and support.
for rule, rule_confidence in confidence.items():
    name1, name2 = rule
    print('Rule:', end=' ')
    print('If a person buys {0}, they will also buy {1}'.format(name1, name2))
    print('- Confidence: {0:.1f}%'.format(100*rule_confidence))
    print('- Support: {0}'.format(support[rule]))
    print()

Rule: If a person buys cheese, they will also buy apples
- Confidence: 61.0%
- Support: 25

Rule: If a person buys cheese, they will also buy bananas
- Confidence: 65.9%
- Support: 27

Rule: If a person buys apples, they will also buy cheese
- Confidence: 69.4%
- Support: 25

Rule: If a person buys apples, they will also buy bananas
- Confidence: 58.3%
- Support: 21

Rule: If a person buys bananas, they will also buy cheese
- Confidence: 45.8%
- Support: 27

Rule: If a person buys bananas, they will also buy apples
- Confidence: 35.6%
- Support: 21

Rule: If a person buys bread, they will also buy milk
- Confidence: 51.9%
- Support: 14

Rule: If a person buys bread, they will also buy apples
- Confidence: 18.5%
- Support: 5

Rule: If a person buys milk, they will also buy bread
- Confidence: 30.4%
- Support: 14

Rule: If a person buys milk, they will also buy apples
- Confidence: 19.6%
- Support: 9

Rule: If a person buys apples, they will also buy bread
- Confidence: 13.9%
- Support: 5

In [32]:
from operator import itemgetter

# Rank the (rule, value) pairs by value, largest first.
by_value = itemgetter(1)
sorted_support = sorted(support.items(), key=by_value, reverse=True)
sorted_confidence = sorted(confidence.items(), key=by_value, reverse=True)

#### 2-) Classification

In [33]:
from sklearn.datasets import load_iris

# Load the iris dataset; `dataset_row` holds its `data` attribute and
# `dataset_column` holds its `target` attribute.
dataset = load_iris()
dataset_row = dataset.data
dataset_column = dataset.target
# Row and column counts of the data array.
(n_samples, n_features) = dataset_row.shape

In [34]:
# Per-column mean of the data array, rounded to two decimal places.
column_means = np.mean(dataset_row, axis=0)
attribute_means = np.round(column_means, 2)
attribute_means

array([5.84, 3.06, 3.76, 1.2 ])

In [35]:
# Discretise each value: 1 where it is at or above its column's entry in
# `attribute_means`, otherwise 0.
dataset_d = (dataset_row >= attribute_means).astype('int')

In [36]:
from sklearn.model_selection import train_test_split

# Split the discretised data and the targets into training and testing subsets.
# A fixed random_state keeps the split the same on every run.
(dataset_row_train, dataset_row_test,
 dataset_column_train, dataset_column_test) = train_test_split(
    dataset_d, dataset_column, random_state=14)
# Report the number of samples (first shape entry) rather than the shape tuple,
# so the message reads "112 training samples" instead of "(112,) training samples".
print("There are {} training samples".format(dataset_column_train.shape[0]))
print("There are {} testing samples".format(dataset_column_test.shape[0]))

There are (112,) training samples
There are (38,) testing samples
