# Using Python for Research Homework: Week 3, Case Study 2

In this case study, we will find and plot the distribution of word frequencies for each translation of Hamlet.  Perhaps the distribution of word frequencies of Hamlet depends on the translation --- let's find out!

In [1]:
# DO NOT EDIT THIS CODE!
import os
import pandas as pd
import numpy as np
from collections import Counter

def count_words_fast(text):
    """Count how often each space-separated token occurs in `text`.

    The text is lower-cased, the characters . , ; : ' " ! ? ( ) and the
    newline are deleted, and the remainder is split on single spaces.

    Note two consequences of that procedure: a newline is removed rather than
    replaced by a space, so the last word of one line is fused with the first
    word of the next; and consecutive spaces produce empty-string tokens,
    which are counted like any other word.

    Returns a collections.Counter mapping token -> number of occurrences.
    """
    dropped = {".", ",", ";", ":", "'", '"', "\n", "!", "?", "(", ")"}
    cleaned = "".join(ch for ch in text.lower() if ch not in dropped)
    return Counter(cleaned.split(" "))

def word_stats(word_counts):
    """Return (number of distinct words, view of their counts).

    `word_counts` is a mapping of word -> count, such as the Counter returned
    by `count_words_fast`. The second element is the mapping's own `.values()`
    view, not a copy.
    """
    return len(word_counts), word_counts.values()

### Exercise 1 

In this case study, we will find and visualize summary statistics of the text of different translations of Hamlet. For this case study, functions `count_words_fast` and `word_stats` are already defined as in the Case 2 Videos (Videos 3.2.x).

#### Instructions 
- Read in the data as a pandas dataframe using `pd.read_csv`. Use the `index_col` argument to set the first column in the csv file as the index for the dataframe. The data can be found at https://courses.edx.org/asset-v1:HarvardX+PH526x+2T2019+type@asset+block@hamlets.csv

In [2]:
# Load the translations from the local copy of the course CSV; the first
# column of the file becomes the row index.
hamlets = pd.read_csv(
    "./asset-v1_HarvardX+PH526x+2T2019+type@asset+block@hamlets.csv",
    index_col=0,
)

### Exercise 2 

In this exercise, we will summarize the text for a single translation of Hamlet in a `pandas` dataframe. 

#### Instructions
- Find the dictionary of word frequency in `text` by calling `count_words_fast()`. Store this as `counted_text`.
- Create a `pandas` dataframe named `data`.
- Using `counted_text`, define two columns in data:
    - `word`, consisting of each unique word in text.
    - `count`, consisting of the number of times each word in `word` is included in the text.

In [3]:
# Work with the first row of `hamlets`: unpack it into its language label and
# its text, then tabulate one row per distinct word with its count.
language, text = hamlets.iloc[0]
counted_text = count_words_fast(text)
data = pd.DataFrame({
    "word": list(counted_text.keys()),
    "count": list(counted_text.values()),
})
language

'English'

### Exercise 3

In this exercise, we will continue to define summary statistics for a single translation of Hamlet. 

#### Instructions
- Add a column to data named `length`, defined as the length of each word.
- Add another column named `frequency`, which is defined as follows for each word in `data`:
    - If `count > 10`, `frequency` is "frequent".
    - If `1 < count <= 10`, `frequency` is "infrequent".
    - If `count == 1`, `frequency` is "unique".

In [4]:
# Word length is taken from the frame's own "word" column rather than from the
# separate `counted_text` dictionary, so each length is guaranteed to belong
# to the word in the same row.
data["length"] = data["word"].apply(len)

# Nested np.where: the "else" branch of the outer call is itself an np.where.
#   count > 10  -> "frequent"
#   count == 1  -> "unique"
#   otherwise   -> "infrequent"  (i.e. 1 < count <= 10, since counts produced
#                                 by a Counter are at least 1)
data["frequency"] = np.where(
    data["count"] > 10,
    "frequent",
    np.where(data["count"] == 1, "unique", "infrequent"),
)
                             

In [5]:
# Exercise 3 Answer #
# Number of rows labelled "unique" in the frequency column.
unique_freq = int((data["frequency"] == "unique").sum())
unique_freq

3348

In [6]:
# Show the per-word table (word, count, length, frequency); pandas abbreviates
# the middle rows of a long frame in the rendered output.
data

Unnamed: 0,word,count,length,frequency
0,the,935,3,frequent
1,tragedie,3,8,infrequent
2,of,576,2,frequent
3,hamlet,97,6,frequent
4,,45513,0,frequent
...,...,...,...,...
5108,shooteexeunt,1,12,unique
5109,marching,1,8,unique
5110,peale,1,5,unique
5111,ord,1,3,unique


### Exercise 4 - The Long Way

In this exercise, we will summarize the statistics in data into a smaller pandas dataframe. 

#### Instructions 
- Create a `pandas` dataframe named `sub_data` including the following columns:
    - `language`, which is the language of the text (defined in Exercise 2).
    - `frequency`, which is a list containing the strings "frequent", "infrequent", and "unique".
    - `mean_word_length`, which is the mean word length of each value in frequency.
    - `num_words`, which is the total number of words in each frequency category.

In [14]:
# write your code here!
# The helpers are defined before they are called so that this cell also runs
# on a fresh kernel (Restart Kernel -> Run All).

def num_freq(data):
    """Count the rows of `data` in each frequency category.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "frequency" column whose values are the labels
        "frequent", "infrequent" and "unique".

    Returns
    -------
    tuple of int
        (frequent, infrequent, unique) row counts, in that order.
    """
    labels = data["frequency"]
    return tuple(
        int((labels == category).sum())
        for category in ("frequent", "infrequent", "unique")
    )


def mean_word_length(data):
    """Compute the mean of the "length" column within each frequency category.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "frequency" column (labels "frequent", "infrequent",
        "unique") and a numeric "length" column.

    Returns
    -------
    tuple of float
        (frequent, infrequent, unique) mean lengths, in that order. A category
        with no rows yields NaN.
    """
    return tuple(
        data.loc[data["frequency"] == category, "length"].mean()
        for category in ("frequent", "infrequent", "unique")
    )


(mean_freq, mean_infreq, mean_uniq) = mean_word_length(data)
(num_frequent, num_infrequent, num_unique) = num_freq(data)

# One row per frequency category, all for the same translation.
sub_data = pd.DataFrame({
    "language": [language, language, language],
    "frequency": ["frequent", "infrequent", "unique"],
    "mean_word_length": [mean_freq, mean_infreq, mean_uniq],
    "num_words": [num_frequent, num_infrequent, num_unique],
})

In [15]:
# Display the summary frame: one row per frequency category.
sub_data

Unnamed: 0,language,frequency,mean_word_length,num_words
0,English,frequent,4.371517,323
1,English,infrequent,5.825243,1442
2,English,unique,7.005675,3348


### Exercise 4 - The Short Way


In [29]:
## A shorter route: build the frame from a dict and let groupby do the work.
## data.groupby("frequency") splits the rows by their frequency label;
## selecting ["length"] and calling .mean() or .size() then returns one value
## per label, as a Series indexed by that label.
##
## pandas aligns those Series on their shared index when building the frame,
## and broadcasts the single `language` value to every row.
##
## The "frequency" column is read back from the groupby index instead of being
## typed in by hand, so each label is guaranteed to sit next to its own
## statistics (a hand-written list would only line up because groupby happens
## to sort the labels alphabetically).
##
## num_words is the number of distinct words (rows of `data`) in each
## category, not the total number of occurrences.

lengths_by_frequency = data.groupby(by="frequency")["length"]
mean_lengths = lengths_by_frequency.mean()

sub_data = pd.DataFrame({"language": language,
                         "frequency": list(mean_lengths.index),
                         "mean_word_length": mean_lengths,
                         "num_words": lengths_by_frequency.size()
                         })
sub_data


Unnamed: 0_level_0,language,frequency,mean_word_length,num_words
frequency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
frequent,English,frequent,4.371517,323
infrequent,English,infrequent,5.825243,1442
unique,English,unique,7.005675,3348


In [31]:
## GUTS OF GROUPBY FUNCTION ##
# Iterating over a groupby object yields (group label, sub-Series) pairs.
# The groupby object gets its own name so that the `sub_data` summary frame
# is not overwritten by an object of a different kind.
length_groups = data.groupby(by="frequency")["length"]

for key, item in length_groups:
    print(key)
    print(item)

frequent
0       3
2       2
3       6
4       0
9       5
       ..
2190    4
2306    7
2994    7
4430    3
4864    3
Name: length, Length: 323, dtype: int64
infrequent
1        8
5        5
10       8
15       4
17       4
        ..
4990     8
5002     7
5011    10
5018     8
5024     4
Name: length, Length: 1442, dtype: int64
unique
6        6
7        6
8        5
12       9
14       9
        ..
5108    12
5109     8
5110     5
5111     3
5112     5
Name: length, Length: 3348, dtype: int64


### Exercise 5

In this exercise, we will join the data summaries for all Hamlet translations.

#### Instructions 
- The previous code for summarizing a particular translation of Hamlet is consolidated into a single function called `summarize_text`. Create a pandas dataframe `grouped_data` consisting of the results of `summarize_text` for each translation of Hamlet in `hamlets`.
    - Use a `for` loop across the row indices of `hamlets` to assign each translation to a new row.
    - Obtain the `i`th row of `hamlets` using the `.iloc` method, and assign the output to variables `language` and `text`.
    - Call `summarize_text` using `language` and `text`, and assign the output to `sub_data`.
    - Use the pandas `.append()` function to append `sub_data` row-wise to `grouped_data` (in pandas 2.0 and later, where `.append()` was removed, use `pd.concat` instead).

In [None]:
def summarize_text(language, text):
    """Summarise word length and word count per frequency category for a text.

    Words are counted with `count_words_fast`; each distinct word is then
    labelled "frequent" (count > 10), "unique" (count == 1) or "infrequent"
    (any other count).

    Parameters
    ----------
    language
        Label (e.g. "English") stored in the "language" column of every row.
    text : str
        The text to summarise.

    Returns
    -------
    pandas.DataFrame
        Indexed by frequency label, with columns "language", "frequency",
        "mean_word_length" and "num_words" (the number of distinct words in
        the category). Only categories that occur in `text` get a row.
    """
    counted_text = count_words_fast(text)

    data = pd.DataFrame({
        "word": list(counted_text.keys()),
        "count": list(counted_text.values())
    })

    # Order matters: count == 1 also satisfies count <= 10, so the "unique"
    # assignment must come last to overwrite "infrequent" for those rows.
    data.loc[data["count"] > 10,  "frequency"] = "frequent"
    data.loc[data["count"] <= 10, "frequency"] = "infrequent"
    data.loc[data["count"] == 1,  "frequency"] = "unique"

    data["length"] = data["word"].apply(len)

    lengths_by_frequency = data.groupby(by="frequency")["length"]
    mean_lengths = lengths_by_frequency.mean()

    sub_data = pd.DataFrame({
        "language": language,
        # Labels are read from the groupby index rather than hard-coded, so
        # they always match their statistics and a text that lacks one of the
        # categories no longer raises a length-mismatch error.
        "frequency": list(mean_lengths.index),
        "mean_word_length": mean_lengths,
        "num_words": lengths_by_frequency.size()
    })

    return sub_data
    
# write your code here!
# Summarise every translation in `hamlets`, then stack the per-translation
# frames in a single call. pd.concat is used because DataFrame.append was
# removed in pandas 2.0; collecting the pieces in a list first also avoids
# re-copying the growing frame on every iteration.
summaries = []
for i in range(hamlets.shape[0]):
    language, text = hamlets.iloc[i]
    sub_data = summarize_text(language, text)
    summaries.append(sub_data)

grouped_data = pd.concat(summaries)
grouped_data


### Exercise 6

In this exercise, we will plot our results and look for differences across each translation.

#### Instructions 
- Plot the word statistics of each translation on a single plot. Note that we have already done most of the work for you.
- Consider: do the word statistics differ by translation?

In [None]:
import matplotlib.pyplot as plt

colors = {"Portuguese": "green", "English": "blue", "German": "red"}
markers = {"frequent": "o","infrequent": "s", "unique": "^"}

# One marker per row of grouped_data: colour encodes the language, marker
# shape encodes the frequency category.
for i in range(grouped_data.shape[0]):
    row = grouped_data.iloc[i]
    plt.plot(row["mean_word_length"], row["num_words"],
        marker=markers[row["frequency"]],
        color = colors[row["language"]],
        markersize = 10
    )

# Empty "proxy" plots: they draw nothing, but each carries a label, so the
# legend gets one entry per language colour and one per marker shape.
color_legend = [
    plt.plot([], [],
        color=shade,
        marker="o",
        label = name, markersize = 10, linestyle="None")
    for name, shade in colors.items()
]
marker_legend = [
    plt.plot([], [],
        color="k",
        marker=shape,
        label = name, markersize = 10, linestyle="None")
    for name, shape in markers.items()
]
plt.legend(numpoints=1, loc = "upper left")

plt.xlabel("Mean Word Length")
plt.ylabel("Number of Words")
# write your code to display the plot here!
plt.show()