### Replace occurrences of CODE_HERE with your own code and evaluate your results by executing all boxes with assert statements
### Import the functions that we need for this lab session

In [135]:
import csv

from functools import partial
from collections import defaultdict, Counter
from itertools import count, permutations, combinations

In [20]:
def apply(data, func):
    """
    Lazily map *func* over *data*.

    Yields ``func(element)`` for every element of the iterable *data*, in
    order. Nothing is evaluated until the returned generator is consumed.
    """
    yield from (func(item) for item in data)

### Define the first data set

In [21]:
# The integers 0 through 9 as a plain list.
data = list(range(10))

### Use the above defined *apply* function to apply the *square* function to the data set defined in *data*

In [22]:
def square(x):
    """Return *x* raised to the power of two."""
    return pow(x, 2)

In [23]:
# Drain the generator returned by apply() into a list.
result = list(apply(data, square))

In [24]:
# The squares of 0..9, in the same order as the input list.
expected = [0,1,4,9,16,25,36,49,64,81]
assert expected == result

### Define and use a lambda function that adds 2 to each element in *data*

In [28]:
# The lambda is bound to a name so that it can be handed to apply().
lambda_function = lambda x: x + 2
result = list(apply(data, lambda_function))

In [29]:
# Each element of 0..9 shifted up by two.
expected = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
assert expected == result

### Do the same as above, but make use of the below defined add function

In [32]:
def add(x, constant=1):
    """Return the sum of *x* and *constant* (*constant* defaults to 1)."""
    total = x + constant
    return total

In [33]:
# Pre-bind constant=2 so the resulting callable only needs the element.
partial_function = partial(add, constant=2)
result = list(apply(data, partial_function))

In [34]:
# Same expectation as for the lambda variant: every element plus two.
expected = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
assert expected == result

### Let's load a more interesting data set <br>
### Some reddit records with the following header<br>
name,category,amazon_link,total_mentions,subreddit_mentions <br>
### An example of a CSV record<br>
Belkin Speaker and Headphone 3.5 mm AUX Audio Cable Splitter,Electronics,https://www.amazon.com/Belkin-Speaker-and-Headphone-Splitter/dp/B000067RC4/ref=sr_1_3?ie=UTF8&amp;qid=1492359433&amp;sr=8-3&amp;keywords=2+way+headphone+splitter,7,1 <br>
### An example of a JSON record after loading the file
```json
{
    "name": "Belkin Speaker and Headphone 3.5 mm AUX Audio Cable Splitter",
    "category": "Electronics",
    "amazon_link": "https://www.amazon.com/Belkin-Speaker-and-Headphone-Splitter/dp/B000067RC4/ref=sr_1_3?ie=UTF8&amp;qid=1492359433&amp;sr=8-3&amp;keywords=2+way+headphone+splitter",
    "total_mentions": "7",
    "subreddit_mentions": "1"
}
```

In [71]:
def load_csv_data(file_path):
    """
    Load a comma-separated file with a header row into a list of records.

    Each record is a dict keyed by the column names taken from the header
    row. All values are returned as strings.

    :param file_path: path of the UTF-8 encoded CSV file to read
    :return: list with one dict per data row, in file order
    """
    # newline="" leaves line-ending handling to the csv module, so line
    # breaks embedded in quoted fields are preserved unchanged.
    with open(file_path, encoding="utf-8", newline="") as fin:
        csvreader = csv.DictReader(fin, delimiter=',', quotechar='"')
        return list(csvreader)

# Rebinds `data`: from here on it holds the CSV records (one dict per row)
# instead of the integer list used in the earlier cells.
data = load_csv_data("data/amazonecho.csv")

### All values in the records are still strings, first transform the values for *total_mentions* and *subreddit_mentions* to integers

In [72]:
def transform_to_int(record):
    """
    Convert the two mention counters of *record* to integers.

    The values stored under *total_mentions* and *subreddit_mentions* are
    replaced in place by their ``int`` conversion; the same record object
    is returned.
    """
    for field in ("total_mentions", "subreddit_mentions"):
        raw_value = record[field]
        record[field] = int(raw_value)
    return record

In [73]:
# Run the conversion over every record and collect the results in a list.
data = list(apply(data, transform_to_int))

In [74]:
# Every record must now carry integer counters; isinstance() is the
# idiomatic type check (rather than comparing type objects with ==).
for e in data:
    assert isinstance(e["total_mentions"], int)
    assert isinstance(e["subreddit_mentions"], int)

### sort all records according to their total_mentions from high to low

In [75]:
# Highest total_mentions first.
data = sorted(
    data,
    key=lambda record: record["total_mentions"],
    reverse=True,
)

In [76]:
# After the descending sort the first record is the one with the most mentions.
assert data[0]["name"] == "Belkin Speaker and Headphone 3.5 mm AUX Audio Cable Splitter"

### sort all records according to their category and then according to the total_mentions from high to low

In [84]:
# reverse=True applies to the whole (category, total_mentions) key, so the
# categories come out in descending order as well.
data = sorted(
    data,
    key=lambda record: (record["category"], record["total_mentions"]),
    reverse=True,
)

In [89]:
# Both key components are sorted descending, so the first record belongs to
# the category that compares highest and has that category's top mention count.
assert data[0]["category"] == "Tools & Home Improvement" and data[0]["total_mentions"] == 4

### count the number of records per category and sort from high to low

In [96]:
# Counter tallies the category of every record directly from the iterable.
counter = Counter(record["category"] for record in data)

# most_common() yields (category, count) pairs from the highest count to
# the lowest; equal counts keep the order in which they were first seen.
categories = counter.most_common()

In [98]:
# Each entry is a (category, count) pair; the most frequent category is first.
assert categories[0] == ("Electronics", 17)

### add to each record the number of words in the *name* field of each record using the *apply* function and sort from high to low

Hint: just use split to split the name string into words

Example <br>
```json
{
    "name": "Belkin Speaker and Headphone 3.5 mm AUX Audio Cable Splitter",
    "category": "Electronics",
    "amazon_link": "https://www.amazon.com/Belkin-Speaker-and-Headphone-Splitter/dp/B000067RC4/ref=sr_1_3?ie=UTF8&amp;qid=1492359433&amp;sr=8-3&amp;keywords=2+way+headphone+splitter",
    "total_mentions": 7,
    "subreddit_mentions": 1,
    "n_words": 10
}
```

In [104]:
def add_number_of_words(record):
    """
    Store the number of words of the record name in the record.

    The *name* value is split on single spaces and the resulting count is
    written to the *n_words* key. The record is modified in place.

    :param record: dict with a string stored under the key *name*
    :return: the same record, now carrying *n_words*
    """
    record["n_words"] = len(record["name"].split(" "))
    return record


# The helper has its own name so that binding the result list to
# number_of_words no longer overwrites the function that produced it.
number_of_words = sorted(
    apply(data, add_number_of_words),
    key=lambda x: x["n_words"],
    reverse=True,
)

In [109]:
# The list is sorted by n_words descending, so index 0 holds the longest name.
assert number_of_words[0]["name"] == "HAVIT Bluetooth 4.1 Transmitter Receiver(aptX), Pair 2 at Once, Mini Wireless Portable Bluetooth Adapter to 3.5mm Audio Devices and Home Stereo, Such as TV, MP3, CD Player, PC, eBook Reader (HV-BT018)"
assert number_of_words[0]["n_words"] == 31

### build a dictionary that maps a category to a list of its record names

Hint: use a defaultdict

In [111]:
# Missing categories start out with an empty list, so names can be appended
# without checking whether the category has been seen before.
category2names = defaultdict(list)
for record in data:
    names = category2names[record["category"]]
    names.append(record["name"])

In [117]:
# One name is stored per record, so the list length equals the category's record count.
assert len(category2names["Electronics"]) == 17

### Imagine we want to construct a feature matrix where the rows/samples are the records and the features the words in the record name
### To achieve this we would have to first extract the words of each record and then map the words to a column index
### In the below exercise we will map the words to a unique index starting from zero
Hint: use the count function (in the itertools module) together with defaultdict 
https://docs.python.org/3/library/itertools.html <br>
As before just use split to get the words per record

In [121]:
# Looking up an unseen key stores the next integer drawn from the counter,
# so keys receive consecutive indices starting at zero.
cnt = count()
indexer = defaultdict(cnt.__next__)

In [123]:
def map_word_to_index(record, indexer):
    """
    Register every word of the record name in *indexer*.

    The *name* value is split on single spaces and each word is looked up in
    *indexer*; the lookup result is discarded and nothing is returned. With a
    mapping that creates entries on first access, the lookup is what adds
    unseen words.
    """
    words = record["name"].split(" ")
    for token in words:
        _ = indexer[token]  # the lookup itself is the point

In [124]:
# Feed the name of every record through the shared indexer.
for record in data:
    map_word_to_index(record=record, indexer=indexer)

In [127]:
# Indices start at 0, so a maximum of 349 corresponds to 350 distinct words.
assert max(indexer.values()) == 349

### Generate all 2-tuple category combinations

In [136]:
# Sort the distinct categories first: set iteration order is arbitrary, and
# sorting makes the order of the generated pairs reproducible between runs.
unique_categories = sorted({record["category"] for record in data})
category_tuples = list(combinations(unique_categories, 2))

In [139]:
# 28 unordered pairs is C(8, 2), i.e. the data holds 8 distinct categories.
assert len(category_tuples) == 28