# ENVIRONMENT

In [None]:
from statistics import harmonic_mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score)
from sklearn.model_selection import (GridSearchCV, RepeatedStratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Pandas display configuration for this notebook.
# Render floats with two fixed decimals instead of scientific notation.
pd.options.display.float_format = "{:.2f}".format
# Show up to 100 columns / rows before pandas truncates the output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# Use the fully qualified option name: the bare "precision" pattern is
# ambiguous in newer pandas releases (it also matches
# "styler.format.precision") and raises OptionError.
pd.set_option("display.precision", 3)
# None disables truncation of long cell contents.
pd.set_option("display.max_colwidth", None)

# Default figure geometry for matplotlib: 12x10 inches at 150 dpi.
plt.rcParams.update({"figure.figsize": [12, 10], "figure.dpi": 150})

# Seaborn theme: library defaults first, then the "notebook" context with
# thicker lines, then the white-grid axes style.
sns.set()
sns.set_context("notebook", rc={"lines.linewidth": 2.5})
sns.set_style("whitegrid")

import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Fixed seed constant (not referenced by the cells visible in this section).
RANDOM_STATE = 1


def rnd(x, ndigits=2):
    """Round ``x`` with the built-in ``round``.

    Args:
        x: Value to round.
        ndigits: Number of decimal places to keep; defaults to 2, which
            matches the previous single-argument behaviour.

    Returns:
        ``round(x, ndigits)``.
    """
    return round(x, ndigits)


## Market Basket Analysis Examples

In [None]:
# Candidate rule generation with itertools
# (`transactions` is expected to exist already; it is not defined in this cell.)

from itertools import permutations

# Collapse every transaction into one flat list, then de-duplicate it.
flattened = [product for purchase in transactions for product in purchase]
items = [*set(flattened)]

# Every ordered pair of distinct items is one candidate rule.
rules = list(permutations(items, 2))
print(rules)

# How many candidate rules were produced.
print(len(rules))

In [None]:
# Apriori and rule-generation helpers from mlxtend.
from mlxtend.frequent_patterns import apriori, association_rules

# Frequent itemsets of at most two items with support >= 0.001, labelled by
# column name. `onehot` must already hold the one-hot encoded transactions.
frequent_itemsets = apriori(
    onehot,
    min_support=0.001,
    max_len=2,
    use_colnames=True,
)

# Build the rules from those itemsets, keeping rules with lift >= 1.0.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.0,
)

In [None]:
# Data preparation: one-hot encode the transactions.
import numpy as np
from mlxtend.preprocessing import TransactionEncoder

# Fit the encoder on the transactions (`transactions` is defined elsewhere).
encoder = TransactionEncoder()
encoder.fit(transactions)

# Transform the transactions and wrap the result in a DataFrame whose
# columns are the item labels learned by the encoder.
onehot = pd.DataFrame(encoder.transform(transactions), columns=encoder.columns_)
print(onehot)

# Support of each single item = column mean.
print(onehot.mean())

# Support of a multi-item set: add a column that is true only when both
# 'fiction' and 'poetry' are present, then take the column means again.
onehot["fiction+poetry"] = np.logical_and(onehot["fiction"], onehot["poetry"])
print(onehot.mean())

In [None]:
# Data preparation: one-hot encode each customer's library of books.
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# Split each comma-separated library string into a list of titles, giving a
# list of lists. `data` is expected to be loaded elsewhere.
libraries = data["Library"].apply(lambda t: t.split(",")).tolist()

# Keep a handle on the encoder fitted on `libraries`, so the DataFrame
# columns come from the same encoder that produced the matrix. (Previously
# the columns were taken from `encoder`, which an earlier cell fitted on a
# different dataset.)
library_encoder = TransactionEncoder().fit(libraries)
books = pd.DataFrame(
    library_encoder.transform(libraries),
    columns=library_encoder.columns_,
)

# Support: share of libraries containing both titles, and each title alone.
supportHG = np.logical_and(books["Hunger"], books["Gatsby"]).mean()
supportH = books["Hunger"].mean()
supportG = books["Gatsby"].mean()

# Confidence and lift of the rule Hunger -> Gatsby.
confidence = supportHG / supportH
lift = supportHG / (supportH * supportG)

# Print Gatsby's support, then the rule's confidence and lift.
print(supportG, confidence, lift)

In [None]:
# Apriori implementation
from mlxtend.frequent_patterns import apriori

# Read the one-hot encoded novelty gifts data and preview the first rows.
onehot = pd.read_csv("datasets/online_retail_onehot.csv")
print(onehot.head())

# Frequent itemsets of up to four items with support >= 0.0005.
frequent_itemsets = apriori(
    onehot,
    min_support=0.0005,
    max_len=4,
    use_colnames=True,
)

# Report how many itemsets were found, then show the first few.
print(len(frequent_itemsets))
print(frequent_itemsets.head())

In [None]:
# How to compute association rules
# (This heading was a bare text line, which is a SyntaxError in a code cell.)
from mlxtend.frequent_patterns import apriori, association_rules

# Load one-hot encoded novelty gifts data
onehot = pd.read_csv('datasets/online_retail_onehot.csv')

# Apply the Apriori algorithm with a minimum support of 0.0001
frequent_itemsets = apriori(
    onehot,
    use_colnames=True,
    min_support=0.0001,
)

# Compute association rules; a support threshold of 0.0 applies no
# additional filtering at this step.
rules = association_rules(
    frequent_itemsets,
    metric="support",
    min_threshold=0.0,
)

# The importance of pruning
# Print the rules.
print(rules)

# Print the frequent itemsets.
print(frequent_itemsets)

# Recompute the rules with a stricter support threshold of 0.001
rules = association_rules(
    frequent_itemsets,
    metric="support",
    min_threshold=0.001,
)
# Print the rules.
print(rules)

# Exploring the set of rules
print(rules.columns)

print(rules[['antecedents', 'consequents']])

# Pruning with other metrics: filter on antecedent support instead
rules = association_rules(
    frequent_itemsets,
    metric="antecedent support",
    min_threshold=0.002,
)
# Print the number of rules.
print(len(rules))

In [None]:
# Generating a heatmap

In [None]:
# scatterplot Support versus confidence
# What can we learn from scatterplots?
    # Identify natural thresholds in data.
    # Not possible with heatmaps or other visualizations.
    # Visualize entire dataset.
    # Not limited to small number of rules.
    # Use findings to prune.
    # Use natural thresholds and patterns to prune.

In [None]:
# parallel coordinate plots

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Download the Online Retail workbook from the UCI repository.
ONLINE_RETAIL_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
df = pd.read_excel(ONLINE_RETAIL_URL)
df.head()

# Cleanup:
#   * strip surrounding whitespace from the product descriptions,
#   * drop rows that have no invoice number,
#   * remove credit transactions (invoice numbers containing "C").
df["Description"] = df["Description"].str.strip()
df.dropna(axis=0, subset=["InvoiceNo"], inplace=True)
df["InvoiceNo"] = df["InvoiceNo"].astype("str")
df = df.loc[~df["InvoiceNo"].str.contains("C")]

# Consolidate the items into one transaction (invoice) per row, with one
# column per product holding the summed quantity. To keep the data set
# small, only sales for France are used here; the code further below
# compares these results with sales from Germany.
france = df[df["Country"] == "France"]
basket = (
    france.groupby(["InvoiceNo", "Description"])["Quantity"]
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("InvoiceNo")
)

# There are a lot of zeros in the data, but we also need to make sure any positive values are converted to a 1 and anything less than 0 is set to 0. This step will complete the one-hot encoding of the data and remove the postage column (since that charge is not one we wish to explore):

def encode_units(x):
    """Convert a basket quantity into a 0/1 purchase flag.

    Args:
        x: Quantity of one product on one invoice.

    Returns:
        1 when ``x`` is strictly positive, otherwise 0. Fractional
        quantities between 0 and 1 and NaN previously fell through both
        branches and returned ``None``; they now map to 1 and 0.
    """
    # A NaN compares False against 0, so it falls into the 0 branch.
    return 1 if x > 0 else 0

# One-hot encode the quantities and drop the POSTAGE column.
basket_sets = basket.applymap(encode_units).drop(columns="POSTAGE")

# Now that the data is structured properly, generate frequent itemsets that
# have a support of at least 7% (chosen by the original author to get
# enough useful examples).
frequent_itemsets = apriori(basket_sets, use_colnames=True, min_support=0.07)


# Final step: generate the rules with their corresponding support,
# confidence and lift, keeping rules whose lift is at least 1.
rules = association_rules(frequent_itemsets, min_threshold=1, metric="lift")
rules.head()

# That's all there is to it: build the frequent itemsets with apriori, then
# build the rules with association_rules.

# The tricky part is figuring out what this tells us. Quite a few rules
# have a high lift, which means the combination occurs more frequently than
# would be expected given the number of transactions and product
# combinations; several also have high confidence. This is where domain
# knowledge comes in handy. Lacking that, just look at a couple of
# illustrative examples.

# Filter with standard pandas code: large lift (6) and high confidence (.8).
rules[rules["lift"].ge(6) & rules["confidence"].ge(0.8)]

# Per the original write-up, the green and red alarm clocks are purchased
# together, and the red paper cups, napkins and plates are purchased
# together, more often than the overall probability would suggest. You may
# then want to look at how much opportunity there is to use the popularity
# of one product to drive sales of another. The write-up reports 340 green
# alarm clocks sold versus 316 red ones, so recommendations might drive
# more red alarm clock sales.

basket["ALARM CLOCK BAKELIKE GREEN"].sum()
basket["ALARM CLOCK BAKELIKE RED"].sum()

# What is also interesting is to see how the combinations vary by country of purchase. Let’s check out what some popular combinations might be in Germany:

# Same pipeline as for France, restricted to German invoices.
germany = df[df["Country"] == "Germany"]
basket2 = (
    germany.groupby(["InvoiceNo", "Description"])["Quantity"]
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("InvoiceNo")
)

# One-hot encode, drop POSTAGE, then mine itemsets at 5% minimum support
# and keep rules whose lift is at least 1.
basket_sets2 = basket2.applymap(encode_units).drop(columns="POSTAGE")
frequent_itemsets2 = apriori(basket_sets2, use_colnames=True, min_support=0.05)
rules2 = association_rules(frequent_itemsets2, min_threshold=1, metric="lift")

# Rules with lift >= 4 and confidence >= 0.5.
rules2[rules2["lift"].ge(4) & rules2["confidence"].ge(0.5)]

✅⭐👍👎

### Scenario 1

One of the most critical factors in customer relationship management that directly affects a company’s long-term profitability is understanding its customers. When a company can better understand its customer characteristics, it is better able to target products and marketing campaigns for customers, resulting in better profits for the company in the long term.

 

You are an analyst for a telecommunications company that wants to better understand the characteristics of its customers. You have been asked to perform a market basket analysis to analyze customer data to identify key associations of your customer purchases, ultimately allowing better business and strategic decision-making.

 

### Scenario 2

One of the most critical factors in patient relationship management that directly affects a hospital’s long-term cost effectiveness is understanding its patients and the conditions leading to hospital admissions. When a hospital can better understand its patients’ characteristics, it is better able to target treatment to patients, resulting in more effective cost of care for the hospital in the long term.

 

You are an analyst for a hospital that wants to better understand the characteristics of its patients. You have been asked to perform a market basket analysis to analyze patient data to identify key associations of your patients, ultimately allowing better business and strategic decision-making for the hospital.


- I'm very frustrated with task 3. I cannot find the information needed to create these 119/120 columns. I have the rows. I honestly have no clue what I'm missing. I cannot find anything in the course materials that is helpful with this dataset.
    - Hello Megan, Using R, the data needs to be converted to factor and transformed into a transactional dataset before using the Apriori algorithm. Reminder: please look up "Transactionalizing the online data frame" in chapter 2 of the DataCamp material (Market Basket Analysis in R).

- I'm working on Task 3 teleco MBA , does anyone have any tips or sources for how to clean the data? I understand we have to remove the blank rows which brings the count down to 7501 but I can't figure out how or why we are removing a column.
    - Please see some tips below: 1. Read dataset into R 2. Explore data structure 3. Check for missing values 4. Remove empty rows or columns 5. Change data to factors 6. Verify that missing values are removed 7. Create a list 8. Create a dataframe for use with apriori 9.
    - Your original mba data sets contained character vectors with categorical variables. Transforming your dataset from character vector to factor vector will create a suitable data structure for mba analysis. I encourage you to read more about character vector and factor vector, how to factor variables in R. 
    
- I'm currently working on Task 3 and find the instructions for the task to be very vague and the datacamp material for basket analysis to be inadequate. What have others found helpful both in terms of finding the proper Python packages to use to satisfy the Task, but also getting a clear explanation for what is expected for the Task 3 analysis?
    - Hello Nathan, Tips: 1. Follow the task overview and watch Video Resources in course “Announcements” 2. Schedule time with your CI if further clarifications are needed 3. Don’t forget to download either the teleco_market_basket.csv or medical_market_basket.csv from the Data Files and Associated Data Dictionary Files in task 3 4. Data transformation (cleaned data for analysis should be 7501 rows and 119 columns or items) 5. Joining the python community may help
    
- Hi cohort! - the "teleco_market_basket" csv for Task 3 seems to have a return resulting in a blank row b/t observations. Was that intentional, or is it just something we should adjust for on loading?
    - This is intentional for the dataset provided.

- Important info: Please download the Market Basket Analysis or D212 task 3 dataset from the “Data Sets and Associated Data Dictionaries” link in the introduction paragraph of Task 3. Market basket analysis must use a transactional dataset to complete this task.

# Part I: Research Question

## A.  Describe the purpose of this data mining report by doing the following:

### 1.  Propose one question relevant to a real-world organizational situation that you will answer using market basket analysis.

    - One relevant question answered by using market basket analysis is
        - Gibson Telecom wants to analyze customer data to identify key associations of your customer purchases, ultimately allowing better business and strategic decision-making
        
    - One plan to reduce churn is to offer customers discounts on items of interest. The executive team is not sure which items customers typically buy together and has reached out to you for help. You have been asked to analyze the data set to explore data on the purchase habits of customers.

### 2.  Define one goal of the data analysis. Ensure that your goal is reasonable within the scope of the scenario and is represented in the available data.
    - One data analysis goal is
        - 
    - Identify services/add-ons frequently purchased together and construct customer service/add-on recommendation engine from the findings
        - Identify items frequently purchased together
    - Upsell products
    - Cross-sell products
    - Improve product recommendations
    - offer items as a "tech bundle" or incentive to sign up for another service
    - offer items for completing a survey

# Part II: Market Basket Justification

## B.  Explain the reasons for using market basket analysis by doing the following:

### 1.  Explain how market basket analyzes the selected dataset. Include expected outcomes.
- Market basket analyzes the selected dataset by
    - 
- The expected outcome is
    - 
- Standard procedure for market basket analysis.
    - 1. Generate large set of rules.
        - Number of rules grows exponentially in number of items. Most rules are not useful.
    - 2. Filter rules using metrics.
        - Must apply initial round of filtering using Apriori algorithm
    - 3. Apply intuition and common sense.
- https://pbpython.com/market-basket-analysis.html    
    - Association analysis is relatively light on the math concepts and easy to explain to non-technical people. In addition, it is an unsupervised learning tool that looks for hidden patterns so there is limited need for data prep and feature engineering. It is a good start for certain cases of data exploration and can point the way for a deeper dive into the data using other approaches.

### 2.  Provide one example of transactions in the dataset.

### 3.  Summarize one assumption of market basket analysis.
- One assumption of market basket analysis is

# Part III: Data Preparation and Analysis

## C.  Prepare and perform market basket analysis by doing the following:

### 1.  Transform the dataset to make it suitable for market basket analysis. Include a copy of the cleaned dataset.
- https://pbpython.com/market-basket-analysis.html
    - One final note, related to the data. This analysis requires that all the data for a transaction be included in 1 row and the items should be 1-hot encoded. 

### 2.  Execute the code used to generate association rules with the Apriori algorithm. Provide screenshots that demonstrate the error-free functionality of the code.

### 3.  Provide values for the support, lift, and confidence of the association rules table.

### 4.  Identify the top three rules generated by the Apriori algorithm. Include a screenshot of the top rules along with their summaries.
- DATACAMP
    - Apriori principle.
        - Subsets of frequent sets are frequent.
        - Retain sets known to be frequent.
        - Prune sets not known to be frequent.
    - Apriori prunes itemsets.
        - Applies minimum support threshold.
        - Modified version can prune by number of items.
        - Doesn't tell us about association rules.


# Part IV: Data Summary and Implications

## D.  Summarize your data analysis by doing the following:

### 1.  Summarize the significance of support, lift, and confidence from the results of the analysis.
- https://pbpython.com/market-basket-analysis.html
    - Support is the relative frequency that the rules show up. In many instances, you may want to look for high support in order to make sure it is a useful relationship. However, there may be instances where a low support is useful if you are trying to find “hidden” relationships.
    - Confidence is a measure of the reliability of the rule. A confidence of .5 in the above example would mean that in 50% of the cases where Diaper and Gum were purchased, the purchase also included Beer and Chips. For product recommendation, a 50% confidence may be perfectly acceptable but in a medical situation, this level may not be high enough.
    - Lift is the ratio of the observed support to that expected if the two rules were independent (see wikipedia). The basic rule of thumb is that a lift value close to 1 means the rules were completely independent. Lift values > 1 are generally more “interesting” and could be indicative of a useful rule pattern.
- Support
    - The significance of Support from the analysis results is
    - DATACAMP
        - The support metric measures the share of transactions that contain an itemset.
            - number of transactions with items(s) / number of transactions
- Lift
    - The significance of Lift from the analysis results is
    - DATACAMP
        - Lift provides another metric for evaluating the relationship between items.
        - Support(X&Y) / (Support(X) * Support(Y))
            - Numerator: Proportion of transactions that contain X and Y.
            - Denominator: Proportion if X and Y assigned randomly and independently.
- Confidence
    - The significance of Confidence from the analysis results is
    - DATACAMP
        - 1. Can improve over support with additional metrics.
        - Adding confidence provides a more complete picture.
        - Support(X&Y) / Support(X)

### 2.  Discuss the practical significance of the findings from the analysis.
- The significance of the analysis findings is

### 3.  Recommend a course of action for the real-world organizational situation from part A1 based on your results from part D1.
- My recommended course of action based on the analysis findings is
 

# Part V: Attachments

## E.  Provide a Panopto video recording that includes a demonstration of the functionality of the code used for the analysis and a summary of the programming environment.
 

Note: The audiovisual recording should feature you visibly presenting the material (i.e., not in voiceover or embedded video) and should simultaneously capture both you and your multimedia presentation.
 

Note: For instructions on how to access and use Panopto, use the "Panopto How-To Videos" web link provided below. To access Panopto's website, navigate to the web link titled "Panopto Access," and then choose to log in using the “WGU” option. If prompted, log in using your WGU student portal credentials, and then it will forward you to Panopto’s website.
 

To submit your recording, upload it to the Panopto drop box titled “Data Mining II – OFM3.” Once the recording has been uploaded and processed in Panopto's system, retrieve the URL of the recording from Panopto and copy and paste it into the Links option. Upload the remaining task requirements using the Attachments option.
 

## F.  Record all web sources used to acquire data or segments of third-party code to support the application. Ensure the web sources are reliable.
- https://pbpython.com/market-basket-analysis.html

## G.  Acknowledge sources, using in-text citations and references, for content that is quoted, paraphrased, or summarized.
- https://pbpython.com/market-basket-analysis.html

## H.  Demonstrate professional communication in the content and presentation of your submission.