In [1]:
import pandas as pd

In [2]:
responses = pd.read_csv("Spreadsheets/A.tsv", sep='\t')

# Import training data - CANNOT match your target data
responses_train = pd.read_csv("Spreadsheets/B.tsv", sep='\t')

# Define global constant, class size
CLASS_SIZE = 45

# Define global constants: WORDS and BIGRAMS
WORDS = 20
BIGRAMS = 1000

# Define global constants for parts of speech
NOUN = 'N'
ADJ = 'J'

<h2>Find Response Counts for Each of the Sections</h2>

In [3]:
response_rates = responses.groupby('Section').count()
# response_rates.loc['Total'] = response_rates.sum()
# response_rates['Max'] = response_rates.max(axis=1)
response_rates

Unnamed: 0_level_0,Unnamed: 0,Question1,Question2,Question3
Section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,22,22,16,12
2,31,30,27,18
3,32,31,27,25
4,28,28,24,18


<h2>Response Rate by Column</h2>

In [4]:
# Find out how many classes by counting rows in the 'Section' column
num_sections = response_rates.shape[0]
total_students = CLASS_SIZE * num_sections

# All responses [column] -> [string list]
# not pd.isnull(...) -> ignore NaN fields
# List of response strings by columns: Q1..Q3
q1_feedback = responses['Question1'].tolist()
q1_feedback = [w for w in q1_feedback if not pd.isnull(w)]
q2_feedback = responses['Question2'].tolist()
q2_feedback = [w for w in q2_feedback if not pd.isnull(w)]
q3_feedback = responses['Question3'].tolist()
q3_feedback = [w for w in q3_feedback if not pd.isnull(w)]

q1_responses = len(q1_feedback)
q2_responses = len(q2_feedback)
q3_responses = len(q3_feedback)

# Which question had the most responses?
max_responses = max(q1_responses, q2_responses, q3_responses)

# Find response rates for all questions
max_resp_rate = round((max_responses / total_students),4)
q1_response_rate = round((q1_responses / total_students), 4)
q2_response_rate = round((q2_responses / total_students), 4)
q3_response_rate = round((q3_responses / total_students), 4)

print('Q1 responses: %d'% q1_responses, 'Q2 responses: %d' % q2_responses,
     'Q3 responses: %d' % q3_responses, 'Q1 response rate: %f' % q1_response_rate,
      'Q2 response rate: %f' % q2_response_rate,
      'Q3 response rate: %f' % q3_response_rate,
      'Response Rate (responded to >= 1 qestion): %f'
      % max_resp_rate, sep='\n')

Q1 responses: 111
Q2 responses: 94
Q3 responses: 73
Q1 response rate: 0.616700
Q2 response rate: 0.522200
Q3 response rate: 0.405600
Response Rate (responded to >= 1 qestion): 0.616700


<h2>Positive/Negative Response Word Counts & Word Lengh</h2>

In [5]:
from nltk.tokenize import RegexpTokenizer
import numpy as np

tokenizer = RegexpTokenizer(r'\w+')




# Find average character counts for responses
q1_char_cnts = [len(w) for w in q1_feedback]
q2_char_cnts = [len(w) for w in q2_feedback]
q1_char_avg = np.mean(q1_char_cnts)
q2_char_avg = np.mean(q2_char_cnts)

# Find average word counts for responses
q1_wdcnts = [len(tokenizer.tokenize(w)) for w in q1_feedback]
q2_wdcnts = [len(tokenizer.tokenize(w)) for w in q2_feedback]
q1_wdcnt_avg = np.mean(q1_wdcnts)
q2_wdcnt_avg = np.mean(q2_wdcnts)


# Show results
print("Total Q1 word count: %d" % sum(q1_wdcnts))
print("Avg. Q1 comment word count: %f" % q1_wdcnt_avg)
print("Avg. Q1 word length: %f" % (q1_char_avg / q1_wdcnt_avg))
print()
print("Total Q2 word count: %d" % sum(q2_wdcnts))
print("Avg. Q2 comment word count: %f" % q2_wdcnt_avg)
print("Avg. Q2 word length: %f" % (q2_char_avg / q2_wdcnt_avg))

Total Q1 word count: 1787
Avg. Q1 comment word count: 16.099099
Avg. Q1 word length: 6.574706

Total Q2 word count: 2097
Avg. Q2 comment word count: 22.308511
Avg. Q2 word length: 6.567477


<h1>Create Training Data</h1>
<p>***This data should NOT be the same as the file in question</p>

In [6]:
q1_train = responses_train['Question1'].tolist()
q1_train = [w for w in q1_train if not pd.isnull(w)]

q2_train = responses_train['Question2'].tolist()
q2_train = [w for w in q2_train if not pd.isnull(w)]


<h1>Create Strings from Feedback Columns</h1>

In [7]:
# Convert response lists to 2 long strings
q1_string = " ".join(q1_feedback)
q2_string = " ".join(q2_feedback)

# Training data, too
q1_train_string = " ".join(q1_train)
q2_train_string = " ".join(q2_train)

<h1>Part of Speech Tagging</h1>

In [8]:
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer

# Train on training string
# Create tokenizer for q1 and q2
q1_sent_tokenizer = PunktSentenceTokenizer(q1_train_string)
q2_sent_tokenizer = PunktSentenceTokenizer(q2_train_string)


#### Q1

# Tokenize and then create part of speech tags - [(word, POS), ...]
q1_tokenized = q1_sent_tokenizer.tokenize(q1_string)
q1_tagged = []

# Append tagged tuples to q1_tagged
for w in q1_tokenized:
    words = nltk.word_tokenize(w)
    tagged = nltk.pos_tag(words)
    q1_tagged.append(tagged)

# Collapse the 2-D list into 1-D
q1_tagged = [tup for sent in q1_tagged for tup in sent]
   

    
#### Q2

# Tokenize and then create part of speech tags - [(word, POS), ...]
q2_tokenized = q2_sent_tokenizer.tokenize(q2_string)
q2_tagged = []

# Append tagged tuples to q2_tagged
for w in q2_tokenized:
    words = nltk.word_tokenize(w)
    tagged = nltk.pos_tag(words)
    q2_tagged.append(tagged)
    
# Collapse 2-D list into 1-D
q2_tagged = [tup for sent in q2_tagged for tup in sent]

<h1>Find 10 Most Frequent Nouns and Adjectives for Q1 and Q2</h1>

In [9]:
# Q1

# Filter target parts of speech
q1_nouns = [item[0] for item in q1_tagged if item[1][0] == NOUN]
q1_adjs = [item[0] for item in q1_tagged if item[1][0] == ADJ]


# Find the 10 most frequently occuring words from each part of speech
q1_noun_fdist = FreqDist(q1_nouns)
q1_adj_fdist = FreqDist(q1_adjs)

# Find 20 most common nouns & adjectives and their frequencies
q1_common_nouns = q1_noun_fdist.most_common(WORDS)
q1_common_adjs = q1_adj_fdist.most_common(WORDS)

print('\n\nQ1: ', '\n\nNouns: ', q1_common_nouns, '\n\nAdjectives: ',
      q1_common_adjs)


# Q2

# Filter target parts of speech
q2_nouns = [item[0] for item in q2_tagged if item[1][0] == NOUN]
q2_adjs = [item[0] for item in q2_tagged if item[1][0] == ADJ]

# Find the 10 most frequently occuring words from each part of speech
q2_noun_fdist = FreqDist(q2_nouns)
q2_adj_fdist = FreqDist(q2_adjs)

# Find 20 most common nouns & adjectives and their frequencies
q2_common_nouns = q2_noun_fdist.most_common(WORDS)
q2_common_adjs = q2_adj_fdist.most_common(WORDS)

print('\n\nQ2: ', '\n\nNouns: ', q2_common_nouns, '\n\nAdjectives: ',
      q2_common_adjs)


# Make sets out of the most common words
q1_word_set = set()
for i in range(WORDS):
    q1_word_set.add(q1_common_nouns[i][0])
    q1_word_set.add(q1_common_adjs[i][0])

q2_word_set = set()
for i in range(WORDS):
    q2_word_set.add(q2_common_nouns[i][0])
    q2_word_set.add(q2_common_adjs[i][0])




Q1:  

Nouns:  [('Sed', 41), ('lacinia', 25), ('sem', 25), ('vel', 25), ('lectus', 25), ('Nulla', 24), ('quis', 24), ('tortor', 22), ('quam', 22), ('diam', 22), ('ante', 22), ('mattis', 21), ('cursus', 21), ('Proin', 19), ('non', 19), ('ipsum', 19), ('Morbi', 18), ('ligula', 18), ('Vestibulum', 18), ('suscipit', 18)] 

Adjectives:  [('Aenean', 16), ('ut', 15), ('iaculis', 15), ('nunc', 9), ('scelerisque', 8), ('aliquet', 8), ('urna', 8), ('ipsum', 7), ('tellus', 6), ('pede', 5), ('pharetra', 5), ('interdum', 5), ('augue', 4), ('accumsan', 4), ('nec', 3), ('sem', 3), ('ullamcorper', 3), ('nibh', 3), ('luctus', 2), ('amet', 2)]


Q2:  

Nouns:  [('Sed', 53), ('Nulla', 33), ('ante', 32), ('et', 31), ('quis', 30), ('quam', 26), ('Morbi', 25), ('ultrices', 24), ('non', 23), ('tortor', 22), ('ipsum', 22), ('Integer', 22), ('augue', 22), ('Vestibulum', 22), ('Ut', 22), ('diam', 22), ('lacinia', 21), ('cursus', 21), ('vel', 20), ('Curabitur', 20)] 

Adjectives:  [('ut', 16), ('iaculis', 13),

<h1>Finding Keywords</h1>

In [10]:
from rake_nltk import Rake
import re
from string import punctuation

# Q1
q1_rake = Rake()

q1_rake.extract_keywords_from_text(q1_string)
q1_keywords = q1_rake.get_ranked_phrases()
new_keywords = []

r = re.compile(r'[{}]'.format(punctuation))
for s in q1_keywords:
    new_strs = r.sub(' ', s)
    if 2 <= len(new_strs.split()) <= 4:
        new_keywords.append(s)

print(new_keywords)
        
# Q2
q2_rake = Rake()

q2_rake.extract_keywords_from_text(q2_string)
q2_keywords = q2_rake.get_ranked_phrases()




['class aptent taciti', 'metus vitae pharetra auctor', 'integer lacinia sollicitudin massa', 'per ince gestas porttitor', 'sed dignissim lacinia nunc', 'vestibulum ante ipsum primis', 'sodales libero eget ante', 'cursus ipsum ante rci', 'vestibulum lacinia accumsan porttitor', 'curabitur sit amet mauris', 'quisque volutpat condimentum velit', 'purus al les ligula', 'tortor neque adipiscing diam', 'sed aliq nostra', 'urna non tincidunt mattis', 'su nam nec ante', 'ut ultrices ultrices enim', 'praesent mauri aliquet eget', 'donec lacus nun ue', 'lorem ipsum dolor imperdiet', 'sed pretium blandit orci', 'maec ia molestie dui', 'augue congue elementum', 'donec acinia nunc', 'maecenas aliquet mollis lectus', 'ultrices sit amet', 'aliq nvallis tristique sem', 'sed convallis tristique sem', 'curabitur tor enean quam', 'lorem dapibus diam', 'tellus consequat imperdiet', 'per inceptos himenaeos', 'sem massa mattis sem', 'donec lacus nunc', 'lacinia molestie dui', 'nunc feugiat mi', 'pede suscip

<h1>Finding Bigrams and Collocations</h1>

<p>Trigrams did not end up being very informative</p>

In [11]:
from nltk.collocations import *

# get rid of punctuation
punct = {',', '.', ';', '-', '!'}
q1_tagged_nopunct = [tup for tup in q1_tagged if tup[0] not in punct]
q2_tagged_nopunct = [tup for tup in q2_tagged if tup[0] not in punct]

stop_words = set(stopwords.words('english'))

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
bi_finder = BigramCollocationFinder.from_words(q1_tagged_nopunct)
tri_finder = TrigramCollocationFinder.from_words(q1_tagged_nopunct)

# # Find trigrams/bigrams
# tri_finder.nbest(trigram_measures.pmi, 40)
# bi_finder.nbest(bigram_measures.pmi, 40)

# Find most frequent bigrams/trigrams
bi_finder.apply_freq_filter(2)
# bi_finder.nbest(bigram_measures.pmi, 100)

tri_finder.apply_freq_filter(2)
# tri_finder.nbest(trigram_measures.pmi, 100)


# Filter out stopwords - 1000, should be longer than the max # of bigrams
bi_finder.apply_word_filter(lambda w: w in stop_words)
bigrams = bi_finder.nbest(bigram_measures.pmi, BIGRAMS)

for i in range(len(bigrams)):
    if bigrams[i][0][0] in q1_word_set or bigrams [i][1][0] in q1_word_set:
        print('(', bigrams[i][0][0], bigrams[i][1][0], ')')

( eu pede )
( pulvinar ullamcorper )
( lacinia sollicitudin )
( risus accumsan )
( sit amet )
( auctor sem )
( interdum magna )
( pharetra auctor )
( augue semper )
( nec tellus )
( tellus consequat )
( sed augue )
( tellus sed )
( ante ipsum )
( ipsum primis )
( viverra nec )
( pede facilisis )
( Nam nec )
( amet pede )
( aliquet mollis )
( nunc egestas )
( sit amet )
( In scelerisque )
( Pellentesque nibh )
( Fusce nec )
( nibh elementum )
( amet augue )
( augue congue )
( magna augue )
( sodales ligula )
( sem massa )
( vitae pharetra )
( accumsan porttitor )
( ipsum sit )
( pede Ut )
( Nulla metus )
( metus ullamcorper )
( Integer lacinia )
( Vestibulum ante )
( pede suscipit )
( Vestibulum sapien )
( ipsum Nulla )
( sapien Proin )
( urna non )
( fermentum non )
( orci aliquet )
( ut felis )
( nunc Curabitur )
( eget ante )
( quam Etiam )
( ullamcorper Nulla )
( eu diam )
( dapibus diam )
( tortor Lorem )
( tortor Pellentesque )
( aliquet Mauris )
( ipsum Praesent )
( nibh Aenean )