In [1]:
import os

import numpy as np

import pandas as pd
from pandas import Series, DataFrame

import matplotlib.pyplot as plt

In [2]:
weights = DataFrame()
file = 'scores.xlsx'

raw_product = pd.read_excel(file, sheet_name='Product', skiprows=2, usecols=3)
raw_product['page'] = 'product'
raw_company = pd.read_excel(file, sheet_name='Company', skiprows=2, usecols=3)
raw_company['page'] = 'company'

raw = raw_product.append(raw_company)
raw = raw.reset_index(drop=True)
raw.columns = ['requirement', 'score', 'notes', 'weight', 'page']
del raw['score']
del raw['notes']


# starting at the last category
# extract the rows for the category to the end of the df
# remove the rows
header_indexes = raw.index[raw['requirement'] == 'Requirement'].tolist()
header_indexes = list(reversed(header_indexes))

for hi in header_indexes:
    category = raw.iloc[hi-1][0]
    sub_weights = DataFrame(raw.iloc[hi+1:])
    sub_weights['category'] = category
    sub_weights = sub_weights[pd.notnull(sub_weights['requirement'])]

    weights = weights.append(sub_weights)
    raw = raw.iloc[0:hi-1]

    
weights = weights.reset_index(drop=True)
weights

Unnamed: 0,requirement,weight,page,category
0,D&B Rating,5,company,Corporate Viability
1,Years in business,1,company,Corporate Viability
2,Hold time less than 5 minutes,5,company,Technical Support
3,Resolution on first call,5,company,Technical Support
4,Polite,2,company,Technical Support
5,Pleasant apppearance,1,product,Packaging
6,Easy to open,1,product,Packaging
7,Recyclable,2,product,Packaging
8,Must be able to hold 20 oz,5,product,Technical Specs
9,Flexible,2,product,Technical Specs


In [3]:
scores = DataFrame()
files = [f for f in os.listdir('.') if f.endswith('-scores.xlsx')]

for file in files:
    raw = pd.read_excel(file, sheet_name='Product', usecols=1, header=None)
    scorer, product = raw.iloc[0][1], raw.iloc[1][1]

    raw_product = pd.read_excel(file, sheet_name='Product', skiprows=2, usecols=2)
    raw_product['page'] = 'product'
    raw_company = pd.read_excel(file, sheet_name='Company', skiprows=2, usecols=2)
    raw_company['page'] = 'company'


    raw = raw_product.append(raw_company)
    raw = raw.reset_index(drop=True)
    raw.columns = ['requirement', 'score', 'notes', 'page']

    # starting at the last category
    # extract the rows for the category to the end of the df
    # remove the rows
    header_indexes = raw.index[raw['requirement'] == 'Requirement'].tolist()
    header_indexes = list(reversed(header_indexes))

    for hi in header_indexes:
        category = raw.iloc[hi-1][0]
        sub_scores = DataFrame(raw.iloc[hi+1:])
        sub_scores['category'] = category
        sub_scores['scorer'] = scorer
        sub_scores['product'] = product
        sub_scores = sub_scores[pd.notnull(sub_scores['requirement'])]

        scores = scores.append(sub_scores)
        raw = raw.iloc[0:hi-1]

    
scores = scores.reset_index(drop=True)
# scores = pd.merge(scores, weights, on=['category', 'page','requirement'])
scores.head()


Unnamed: 0,requirement,score,notes,page,category,scorer,product
0,D&B Rating,4,,company,Corporate Viability,Jane Doe,Acme Widget
1,Years in business,5,,company,Corporate Viability,Jane Doe,Acme Widget
2,Hold time less than 5 minutes,3,,company,Technical Support,Jane Doe,Acme Widget
3,Resolution on first call,5,,company,Technical Support,Jane Doe,Acme Widget
4,Polite,5,,company,Technical Support,Jane Doe,Acme Widget


In [4]:
cleansed = scores.dropna(axis=0, subset=['score'])
cleansed

Unnamed: 0,requirement,score,notes,page,category,scorer,product
0,D&B Rating,4,,company,Corporate Viability,Jane Doe,Acme Widget
1,Years in business,5,,company,Corporate Viability,Jane Doe,Acme Widget
2,Hold time less than 5 minutes,3,,company,Technical Support,Jane Doe,Acme Widget
3,Resolution on first call,5,,company,Technical Support,Jane Doe,Acme Widget
4,Polite,5,,company,Technical Support,Jane Doe,Acme Widget
5,Pleasant apppearance,4,,product,Packaging,Jane Doe,Acme Widget
6,Easy to open,1,,product,Packaging,Jane Doe,Acme Widget
7,Recyclable,3,,product,Packaging,Jane Doe,Acme Widget
8,Must be able to hold 20 oz,5,,product,Technical Specs,Jane Doe,Acme Widget
9,Flexible,2,,product,Technical Specs,Jane Doe,Acme Widget


In [5]:
score_series = cleansed['score']
score_series = score_series.astype('int64')
score_series


0     4
1     5
2     3
3     5
4     5
5     4
6     1
7     3
8     5
9     2
10    4
12    3
13    2
14    4
15    2
16    3
17    2
18    0
19    5
20    4
21    1
22    3
23    5
24    5
25    5
26    2
27    1
28    1
29    1
30    5
32    1
33    2
34    2
35    3
36    3
37    3
38    2
39    2
40    0
41    5
42    1
43    1
Name: score, dtype: int64

In [6]:
cleansed['score'] = score_series
cleansed['score']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0     4
1     5
2     3
3     5
4     5
5     4
6     1
7     3
8     5
9     2
10    4
12    3
13    2
14    4
15    2
16    3
17    2
18    0
19    5
20    4
21    1
22    3
23    5
24    5
25    5
26    2
27    1
28    1
29    1
30    5
32    1
33    2
34    2
35    3
36    3
37    3
38    2
39    2
40    0
41    5
42    1
43    1
Name: score, dtype: int64

In [7]:
minimums = cleansed.sort_values('score').groupby(['product','page', 'category', 'requirement'], as_index=False).first()
maximums = cleansed.sort_values('score').groupby(['product','page', 'category', 'requirement'], as_index=False).last()
means = cleansed.groupby(['product','page', 'category', 'requirement'], as_index=False)['score'].mean()
counts = cleansed.groupby(['product','page', 'category', 'requirement'], as_index=False).size().reset_index(name='responses')

minimums = minimums.rename({'score': 'minimum score', 'notes': 'minimum notes', 'scorer': 'minimum scorer'}, axis='columns')
maximums = maximums.rename({'score': 'maximum score','notes': 'maximum notes', 'scorer': 'maximum scorer'}, axis='columns')
means = means.rename({'score': 'mean score'}, axis='columns')

aggregate = pd.merge(minimums, maximums, on=['product','page', 'category', 'requirement'])
aggregate = pd.merge(aggregate, means, on=['product','page', 'category', 'requirement'])
aggregate = pd.merge(aggregate, counts, on=['product','page', 'category', 'requirement'])

aggregate

Unnamed: 0,product,page,category,requirement,minimum score,minimum notes,minimum scorer,maximum score,maximum notes,maximum scorer,mean score,responses
0,Acme Widget,company,Corporate Viability,D&B Rating,3,,John Doe,4,,Jane Doe,3.5,2
1,Acme Widget,company,Corporate Viability,Years in business,5,,John Doe,5,,Jane Doe,5.0,2
2,Acme Widget,company,Technical Support,Hold time less than 5 minutes,3,,Jane Doe,5,,John Doe,4.0,2
3,Acme Widget,company,Technical Support,Polite,2,,John Doe,5,,Jane Doe,3.5,2
4,Acme Widget,company,Technical Support,Resolution on first call,5,,John Doe,5,,Jane Doe,5.0,2
5,Acme Widget,product,Packaging,Easy to open,1,,John Doe,1,,Jane Doe,1.0,2
6,Acme Widget,product,Packaging,Pleasant apppearance,1,,John Doe,4,,Jane Doe,2.5,2
7,Acme Widget,product,Packaging,Recyclable,1,,John Doe,3,,Jane Doe,2.0,2
8,Acme Widget,product,Technical Specs,Flexible,2,,Jane Doe,2,,Jane Doe,2.0,1
9,Acme Widget,product,Technical Specs,Gizmos to the wingnut,1,,John Doe,4,,Jane Doe,2.5,2
