In [1]:
import os
import pandas as pd
import mlxtend
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
#import time
#from mlxtend.frequent_patterns import fpgrowth

## Data Cleaning

In [2]:
# attempt to get all the files within the balnton_data directory
path = r"./blanton_data/"
directories = os.listdir(path)

# this is the master dataframe
df_master = pd.DataFrame()

# these are the transaction databases, stored as indexes in a dictionary
# dictionary of dictionaries of arrays
transaction_database = {
    'emotional' : {},
    'aesthetic' : {},
    'either' : {}
}

valid_cols = ['accession_#', 'artist_sort_name', 'artist_life_dates',
           'artist_nationality', 'title', 'creation_date', 'medium', 'credit_line',
           'dimensions', 'student_id', 'emotional_reaction', 'aesthetically_pleasing']

In [3]:
def update_transaction_database(student_df):
    # get the student_id to create a new entry in the transaction_database
    student_id = student_df.at[0, 'student_id']
    
    # create new entry in database
    transaction_database['emotional'][student_id] = []
    transaction_database['aesthetic'][student_id] = []
    transaction_database['either'][student_id] = []
    
    for index, row in student_df.iterrows():
        is_emotional = row['emotional_reaction'] == 1
        is_aesthetic = row['aesthetically_pleasing'] == 1
        
        if is_emotional:
            transaction_database['emotional'][student_id].append(row['accession_#'])
        if is_aesthetic:
            transaction_database['aesthetic'][student_id].append(row['accession_#'])
        if is_emotional or is_aesthetic:
            transaction_database['either'][student_id].append(row['accession_#'])

In [4]:
def clean_temp(file, df_temp):
    # lowercase and ignore whitespace for consistency
    df_temp.columns = map(str.lower, df_temp.columns)
    new_columns = []
    for column in df_temp.columns:
        new_column = column.strip().replace(" ", "_")
        new_columns.append(new_column)
    df_temp.columns = new_columns

    # correct bad spelling
    if 'asthetically_pleasing' in df_temp.columns:
        df_temp = df_temp.rename(columns={'asthetically_pleasing' : 'aesthetically_pleasing'})

    # abhi's case - student_id missing
    if 'student_id' not in df_temp.columns:
        gen_id = file.split('_')[0]
        df_temp['student_id'] = gen_id
    return df_temp.copy()

In [5]:
for file in directories:
    rel_path = path + file
    if file.endswith('.xlsx'):
        df_temp = pd.read_excel(rel_path)
    
        df_temp = clean_temp(file, df_temp)
        
        # student_id, emotional_reaction, and aesthetically_pleasing
        # are not always there, filter out the ones that don't have one of these columns
        is_valid = 'emotional_reaction' in df_temp.columns and 'aesthetically_pleasing' in df_temp.columns
        
        if(is_valid):
            
            # removed the 'unnamed' feilds
            df_temp = df_temp[valid_cols]
            
            # add an 'either' column
            # df_temp['either'] = np.where((df_temp['emotional_reaction'] | df_temp['aesthetically_pleasing'] > 0), 1, 0)
            df_temp['either'] = df_temp['emotional_reaction'].astype(int) + df_temp['aesthetically_pleasing'].astype(int)
            
            update_transaction_database(df_temp)

            df_master = df_master.append(df_temp, ignore_index = True)

# sort by student_id to make things easier
df_master = df_master.sort_values(by=['student_id'])
df_master
        
#get list of student ids
students = df_master['student_id']
students = students.drop_duplicates()
students

986     AnaW4804
2251      Frp323
2166    ahasbany
60       akp2597
1092    akshay17
915     araman18
438     ashk2016
418         ayan
585        benli
2402    brandonn
2049    caitlien
797     cjenwere
1920     colette
2324    ericamtz
282        gokul
238     gperez13
1681     gskaggs
738      hh26257
135     hrithikr
1516         ich
524      jrm7328
2641    kevliang
2502     kjh2858
1240     kjoseph
1628    kushalcd
1410      lgm977
1261     manders
691        maram
1803    mshao123
2398    nithin13
1602      nk9373
2139      nz3222
2759      pa8789
324          poi
2039    pranooha
2822     preston
1336    rahulram
2536     raymond
1038       riz74
2760    rsmoreno
2998    ryanyz10
600      serfurt
2599    shaniyur
2938    shyampat
899      simon18
1460    snowaski
1861     sr46252
0        sra2398
3060     st33578
1140    tchatter
1778       viswa
Name: student_id, dtype: object

# Data Analysis
## Questions to Answer:
- What artist was the most likely to elicit an emotional reaction?
- What artist was the most likely to elicit an aesthetically pleasing reaction?
- What pieces of art were the most widely judged to have any reaction?
- What student had the most “1” reactions – find the art lover
- What student had the most “0” reactions – find the art grinch



In [6]:
"""
returns a dictionary tallying the count of a certain attrbute 
for each unique item defined in category

df - dataframe
category - distinct items
tally_by - column to count number of distinct items by
target - target value to add +1 to the tally
"""
def get_count(df, category, tally_by, target, by_one):
    count = {}
    for index, row in df.iterrows():
        
        inc = 1
        if by_one == False:
            inc = row[tally_by]
        
        if row[tally_by] >= target:
            if row[category] not in count:
                count[row[category]] = inc
                
            else:
                count[row[category]] += inc
    return count

In [7]:
def max_count(count):
    return max(count, key=count.get)
def min_count(count):
    return min(count, key=count.get)

In [8]:
# What artist was the most likely to elicit an emotional reaction?
emo_artist = get_count(df_master, 'artist_sort_name', 'emotional_reaction', 1, True)

# What artist was the most likely to elicit an aesthetically pleasing reaction?
aes_artist = get_count(df_master, 'artist_sort_name', 'aesthetically_pleasing', 1, True)

# What pieces of art were the most widely judged to have any reaction?
reaction_art = get_count(df_master, 'title', 'either', 1, True)

# What student had the most “1” reactions – find the art lover
# What student had the most “0” reactions – find the art grinch
lover_student = get_count(df_master, 'student_id', 'either', 1, False)

In [9]:
print("What artist was the most likely to elicit an emotional reaction?")
print(max_count(emo_artist))
print()
print("What artist was the most likely to elicit an aesthetically pleasing reaction?")
print(max_count(aes_artist))
print()
print("What pieces of art were the most widely judged to have any reaction?")
print(max_count(reaction_art))
print()
print("What student had the most “1” reactions – find the art lover")
print(max_count(lover_student))
print()
print("What student had the most “0” reactions – find the art grinch")
print(min_count(lover_student))

What artist was the most likely to elicit an emotional reaction?
Camnitzer, Luis

What artist was the most likely to elicit an aesthetically pleasing reaction?
Anonymous

What pieces of art were the most widely judged to have any reaction?
Untitled

What student had the most “1” reactions – find the art lover
st33578

What student had the most “0” reactions – find the art grinch
ich


## Getting Frequent Itemsets

In [10]:
dataset = []
for student in students:
    art = transaction_database['aesthetic'][student]
    art = [str(artpiece) for artpiece in art]
    dataset.append(art)

In [11]:
te = TransactionEncoder()
te_array = te.fit(dataset).transform(dataset)
df_item = pd.DataFrame(te_array, columns = te.columns_)
df_item = df_item.set_index(students)
df_item.head()

Unnamed: 0_level_0,1977.112,1977.119,1977.25,1977.32,1977.9,1978.46,1978.5,1978.89,1978.93,1979.30,...,P1974.11.2,P1975.18.1/15,P1975.18.10/15,P1975.18.2/15,P1975.18.6/15,P1975.18.7/15,P1975.21.1,P1975.23.3,P1976.10.1,P1976.13.2
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AnaW4804,True,False,False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
Frp323,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ahasbany,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
akp2597,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
akshay17,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True


In [12]:
#decided to use apriori because trees dont save time here 
#time_ap = time.time()
frequent_itemsets = apriori(df_item, min_support =0.03, use_colnames = True)
#time_ap = time.time() - time_ap

#time_tree = time.time()
#frequent_itemsets_tree = fpgrowth(df_item, min_support =0.03)
#time_tree = time.time() - time_tree
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.058824,(1977.112)
1,0.098039,(1977.119)
2,0.058824,(1977.25)
3,0.039216,(1977.32)
4,0.078431,(1977.9)
...,...,...
1093,0.039216,"(2017.1285, 2017.282, 1986.361.21/25, 2005.151..."
1094,0.039216,"(2017.1285, 2017.282, 2017.1087, 1986.361.21/2..."
1095,0.039216,"(2002.2589, 2016.147, 2017.106, 2004.45, G1976..."
1096,0.039216,"(2017.1285, 2017.282, 2017.1087, 2005.151, 201..."


## Generate Association Rules

In [13]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(2017.429),(1977.112),0.058824,0.058824,0.039216,0.666667,11.333333,0.035755,2.823529
1,(1977.112),(2017.429),0.058824,0.058824,0.039216,0.666667,11.333333,0.035755,2.823529
2,(1977.119),(1991.187),0.098039,0.156863,0.039216,0.400000,2.550000,0.023837,1.405229
3,(1991.187),(1977.119),0.156863,0.098039,0.039216,0.250000,2.550000,0.023837,1.202614
4,(1977.119),(2002.2171),0.098039,0.039216,0.039216,0.400000,10.200000,0.035371,1.601307
...,...,...,...,...,...,...,...,...,...
2755,(2017.282),"(2017.1285, 2017.1087, 1986.361.21/25, 2005.15...",0.039216,0.039216,0.039216,1.000000,25.500000,0.037678,inf
2756,(2017.1087),"(2017.1285, 2017.282, 1986.361.21/25, 2005.151...",0.098039,0.039216,0.039216,0.400000,10.200000,0.035371,1.601307
2757,(1986.361.21/25),"(2017.1285, 2017.282, 2017.1087, 2005.151, 201...",0.117647,0.039216,0.039216,0.333333,8.500000,0.034602,1.441176
2758,(2005.151),"(2017.1285, 2017.282, 2017.1087, 1986.361.21/2...",0.058824,0.039216,0.039216,0.666667,17.000000,0.036909,2.882353


## Apply Association Rules

In [14]:
rules = rules[ (rules['lift'] >= 23) &
      (rules['confidence'] == 1) ]
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
30,(1977.32),(2003.29),0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf
31,(2003.29),(1977.32),0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf
68,(P1967.1.17/20),(1980.72.10/10),0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf
69,(1980.72.10/10),(P1967.1.17/20),0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf
222,(1995.19),(1983.132.21/55),0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf
...,...,...,...,...,...,...,...,...,...
2750,"(2017.1087, 2017.912)","(1986.361.21/25, 2017.1285, 2005.151, 2017.282)",0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf
2751,"(1986.361.21/25, 2005.151)","(2017.1087, 2017.1285, 2017.282, 2017.912)",0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf
2752,"(1986.361.21/25, 2017.912)","(2017.1087, 2017.1285, 2005.151, 2017.282)",0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf
2753,"(2005.151, 2017.912)","(2017.1087, 2017.1285, 1986.361.21/25, 2017.282)",0.039216,0.039216,0.039216,1.0,25.5,0.037678,inf


## Creating Recomendations Per Student

In [15]:
# dataset = [str(data) for data in dataset]
# student_likes = set(dataset)

# ants = set(rules['antecedents'])
# ants = [set(x) for x in ants]
# ants = [str(data) for data in ants]
# ants

# intsect = student_likes.intersection(ants)
# intsect

In [16]:
df_rec = pd.DataFrame()
df_rec['Student'] = students
df_rec['Recomendation 1'] = 'NA'
df_rec['Recomendation 2'] = 'NA'
df_rec['Recomendation 3'] = 'NA'
df_rec = df_rec.set_index('Student')

In [17]:
for student, data in df_rec.iterrows():
    student_likes = transaction_database['aesthetic'][student]
    student_likes = [str(artpiece) for artpiece in student_likes]
    if(len(student_likes) > 0):
        index = 1;
        for like in student_likes: 
            consq = rules[rules['antecedents'] == like]
            print(consq)
            conq = set(consq['consequents'])
            print(conq)
            setLikes = set(student_likes)
            diff = conq.difference(setLikes)
            print(diff)
            for art in diff:
                if(index == 1):
                    df_rec.at[student, 'Recomendation 1'] = art
                if(index == 2):
                    df_rec.at[student, 'Recomendation 2'] = art
                if(index == 3):
                    df_rec.at[student, 'Recomendation 2'] = art
                if(index > 3):
                    break
                index += 1 
            if (index == 4):
                break
            
    #if(len(student_likes) == 0):
        #return top three most popular art works

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Column

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Column

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Column

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Column

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Column

Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()


Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
set()
set()
Empty DataFrame
Column

In [18]:
df_rec

Unnamed: 0_level_0,Recomendation 1,Recomendation 2,Recomendation 3
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AnaW4804,,,
Frp323,,,
ahasbany,,,
akp2597,,,
akshay17,,,
araman18,,,
ashk2016,,,
ayan,,,
benli,,,
brandonn,,,
