In [None]:
"""

Filename: project_name_iteration_keyword.ipynb

Author:   Ednalyn C. De Dios
Phone:    (210) 236-2685
Email:    ednalyn.dedios@gmail.com

Created:  January 00, 2020
Updated:  January 00, 2020

PURPOSE: describe the purpose of this script.

PREREQUISITES: list any prerequisites or assumptions here.

DON'T FORGET TO:
1. Hydrate.
2. Sleep.
3. Have fun!

"""

# Set up Environment

In [None]:
import os

# manipulate dataframes
import pandas as pd

# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# for natural language processing: named entity recognition
import spacy
from collections import Counter

# for natural language processing: sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import word_tokenize, sent_tokenize

# add appropriate words that will be ignored in the analysis
# ('nan' is the text a missing value turns into once it is converted with str())
ADDITIONAL_STOPWORDS = ['nan']

# for visualizations
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# to print out all the outputs: display the value of every top-level
# expression in a cell instead of only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options: None removes the limit, so dataframes are rendered
# with every column, every row, and untruncated cell text
# NOTE(review): with no row limit, displaying a large dataframe renders all of it
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Read all files from a folder

In [None]:
def read_data(folder):
    '''
    Read every raw data file in ``folder`` and combine them into a single
    dataframe.

    Each file is parsed with ``pd.read_csv`` using its defaults (swap in one
    of the commented-out readers below for Excel or tab-separated input) and
    the name of the originating file is stored in a ``file_name`` column.

    Parameters
    ----------
    folder : str or path-like
        Directory that contains the raw data files.

    Returns
    -------
    pandas.DataFrame
        Rows of every readable file, stacked in ``os.listdir`` order with each
        file's own row index preserved. An empty dataframe is returned when
        no file could be read.

    Notes
    -----
    Reading is best-effort: a file that cannot be parsed is reported on
    standard output and skipped, so one bad file does not abort the load.
    '''
    frames = []
    for file_name in os.listdir(folder):
        file_path = os.path.join(folder, file_name)
        try:
            # df = pd.read_excel(file_path) # excel
            # df = pd.read_csv(file_path, sep='\t') # tsv file
            df = pd.read_csv(file_path) # vanilla csv
            df['file_name'] = file_name
            frames.append(df)
        except Exception as e:
            print(f"Cannot read file: {file_name}")
            print(str(e))

    if not frames:
        # Nothing was readable (empty folder or every file failed).
        return pd.DataFrame()

    # One concat over the collected frames replaces repeated DataFrame.append
    # calls (removed in pandas 2.0) and no longer depends on the first file
    # being the one that succeeds.
    return pd.concat(frames)

In [None]:
# Placeholder path from the template: replace it with the directory that
# contains the raw data files before running this cell.
folder = 'G:/path/to/data/parent_folder_name'
# df holds the combined dataframe returned by read_data
df = read_data(folder)

# Show value counts

In [None]:
def show_values(df, columns):
    """
    Print the value counts of each of the specified columns.

    For every entry of ``columns`` the upper-cased column name, a separator
    line, and the column's value counts (missing values included) are written
    to standard output, followed by blank lines. Nothing is returned.
    """
    separator = '=================================================='
    for column_name in columns:
        print(str(column_name).upper())
        print(separator)
        print(df[column_name].value_counts(dropna=False))
        print('\n\n')

In [None]:
# 'column' is a template placeholder: replace each entry with a real column name of df.
show_values(df, ['column', 'column', 'column'])

# Generate n-grams

In [None]:
def clean(text):
    """
    Normalize a string and return its lemmatized, non-stop-word tokens.

    The text is NFKD-normalized, reduced to ASCII (characters without an
    ASCII form are dropped), lower-cased, stripped of every character that is
    neither a word character nor whitespace, and split on whitespace. Tokens
    found in NLTK's English stop-word list or in ``ADDITIONAL_STOPWORDS`` are
    discarded; the remaining tokens are lemmatized with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set makes each membership test constant-time; with the list form the
    # whole stop-word list was rescanned for every token.
    stopwords = set(nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Punctuation is deleted rather than replaced by a space, so "don't"
    # becomes the single token "dont".
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

def get_words(df,column):
    """
    Takes in a dataframe and a column name and returns the list of
    cleaned words found in the values of that column.

    Every value is converted with ``str`` (so a missing value contributes the
    text 'nan'), the pieces are joined with single spaces, and the result is
    passed to ``clean``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text.
    column : str
        Name of the column to read.

    Returns
    -------
    list
        Whatever ``clean`` returns for the joined text.
    """
    # Join the values themselves instead of the repr of the whole list: the
    # repr escapes control characters (a newline becomes a backslash followed
    # by "n"), which glued neighbouring words together once punctuation was
    # stripped ("hello\nworld" -> "hellonworld").
    return clean(' '.join(str(value) for value in df[column].tolist()))

def get_unigrams(words):
    """
    Count how often each word occurs.

    Takes in a list of words and returns a series indexed by word whose
    values are the occurrence counts, as produced by ``value_counts``.
    """
    word_series = pd.Series(words)
    return word_series.value_counts()

def get_bigrams(words, top_n=40):
    """
    Takes in a list of words and returns a series of
    bigrams with value counts.

    Parameters
    ----------
    words : list of str
        Tokens in reading order.
    top_n : int or None, optional
        Number of most frequent bigrams to keep (default 40, the value that
        used to be hard-coded). ``None`` keeps all of them.

    Returns
    -------
    pandas.Series
        Counts indexed by ``(word1, word2)`` tuples, most frequent first.
    """
    bigram_counts = pd.Series(list(nltk.ngrams(words, 2))).value_counts()
    # .iloc makes the cut explicitly positional.
    return bigram_counts.iloc[:top_n]

def get_trigrams(words, top_n=40):
    """
    Takes in a list of words and returns a series of
    trigrams with value counts.

    Parameters
    ----------
    words : list of str
        Tokens in reading order.
    top_n : int or None, optional
        Number of most frequent trigrams to keep (default 40, the value that
        used to be hard-coded). ``None`` keeps all of them.

    Returns
    -------
    pandas.Series
        Counts indexed by 3-word tuples, most frequent first.
    """
    trigram_counts = pd.Series(list(nltk.ngrams(words, 3))).value_counts()
    # .iloc makes the cut explicitly positional.
    return trigram_counts.iloc[:top_n]

def get_qualgrams(words, top_n=40):
    """
    Takes in a list of words and returns a series of
    qualgrams (4-word sequences) with value counts.

    Parameters
    ----------
    words : list of str
        Tokens in reading order.
    top_n : int or None, optional
        Number of most frequent 4-grams to keep (default 40, the value that
        used to be hard-coded). ``None`` keeps all of them.

    Returns
    -------
    pandas.Series
        Counts indexed by 4-word tuples, most frequent first.
    """
    qualgram_counts = pd.Series(list(nltk.ngrams(words, 4))).value_counts()
    # .iloc makes the cut explicitly positional.
    return qualgram_counts.iloc[:top_n]

def _ngram_frame(counts, label):
    """Turn a value-counts series into a dataframe with columns ``label`` and 'count'."""
    # Naming the index and the values explicitly does not depend on the
    # default column names, which differ between pandas versions ('index' and
    # 0 before pandas 2.0); the old rename key '0' was a string and never
    # matched the integer column 0.
    return counts.rename_axis(label).reset_index(name='count')


def get_ngrams(df,column):
    """
    Takes in a dataframe with column name and generates
    dataframes of bigrams, trigrams, and qualgrams.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text.
    column : str
        Name of the text column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(bigrams, trigrams, qualgrams)``. Each dataframe has the n-gram in
        a column named 'bigram', 'trigram' or 'qualgram' respectively and its
        frequency in a column named 'count'.
    """
    # Clean the column once and reuse the words for all three n-gram sizes.
    words = get_words(df, column)
    return (_ngram_frame(get_bigrams(words), 'bigram'),
            _ngram_frame(get_trigrams(words), 'trigram'),
            _ngram_frame(get_qualgrams(words), 'qualgram'))

def viz_bigrams(df,column):
    """
    Draw a horizontal bar chart of the most frequent bigrams in a column.

    The words of ``df[column]`` are obtained with ``get_words`` and counted
    with ``get_bigrams``; the counts are sorted ascending, plotted, and the
    chart is titled and labelled through pyplot. Nothing is returned.
    """
    words = get_words(df, column)
    bigram_counts = get_bigrams(words)
    ordered_counts = bigram_counts.sort_values()
    ordered_counts.plot.barh(color='blue', width=.9, figsize=(12, 8))

    plt.title('40 Most Frequently Occuring Bigrams')
    plt.ylabel('Bigram')
    plt.xlabel('# Occurances')

def viz_trigrams(df,column):
    """
    Draw a horizontal bar chart of the most frequent trigrams in a column.

    The words of ``df[column]`` are obtained with ``get_words`` and counted
    with ``get_trigrams``; the counts are sorted ascending, plotted, and the
    chart is titled and labelled through pyplot. Nothing is returned.
    """
    words = get_words(df, column)
    trigram_counts = get_trigrams(words)
    ordered_counts = trigram_counts.sort_values()
    ordered_counts.plot.barh(color='blue', width=.9, figsize=(12, 8))

    plt.title('40 Most Frequently Occuring Trigrams')
    plt.ylabel('Trigram')
    plt.xlabel('# Occurances')
    
def viz_qualgrams(df,column):
    """
    Draw a horizontal bar chart of the most frequent qualgrams (4-word
    sequences) in a column.

    The words of ``df[column]`` are obtained with ``get_words`` and counted
    with ``get_qualgrams``; the counts are sorted ascending, plotted, and the
    chart is titled and labelled through pyplot. Nothing is returned.
    """
    # Count 4-grams: this used to call get_bigrams, so the chart titled
    # "Qualgrams" actually showed bigram counts.
    get_qualgrams(get_words(df,column)).sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))

    plt.title('40 Most Frequently Occuring Qualgrams')
    plt.ylabel('Qualgram')
    plt.xlabel('# Occurances')

# Generate dataframe from value counts

In [None]:
# Top 20 values of `column` (missing values are counted too) as a dataframe
# with the value in 'column' and its frequency in 'count'.
# `column` is a template placeholder: replace it with a real column of df.
df_value_counts = df.column.value_counts(dropna=False)[:20].to_frame().reset_index().rename(columns={'index':'column', 'column':'count'})
# a bare expression so the notebook displays the frame
df_value_counts

# Visualize value counts

In [None]:
# Horizontal bar chart of the top 20 value counts of `column`, sorted
# ascending before plotting. `column` and the 'XXX' labels are template
# placeholders to be replaced.
df.column.value_counts(dropna=False)[:20].sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
plt.title('Top 20 XXX')
plt.ylabel('XXX')
plt.xlabel('# Occurances')

# List top 10 values of a column based on another column

In [None]:
# One subset per contact channel; each label is matched in its lowercase and
# capitalized spellings only.
df_call = df.loc[df['category_column'].isin(['call', 'Call'])]
df_chat = df.loc[df['category_column'].isin(['chat', 'Chat'])]
df_email = df.loc[df['category_column'].isin(['email', 'Email'])]

In [None]:
def list_top10(col_name, df_names):
    for df in df_names:
        print(df[col_name].value_counts(dropna=False)[:10].to_frame().reset_index().rename(columns={'index':col_name, col_name:'count'}))
        print('\n')

In [None]:
# 'column_name' is a template placeholder: replace it with a column present in all three subsets.
list_top10('column_name', [df_call, df_chat, df_email])

# Visualize top 10 values of a column based on another column

In [None]:
def viz_top10(col_name, df_names):
    """
    Draw one horizontal bar chart per dataframe showing the 10 most frequent
    values of ``col_name``.

    Parameters
    ----------
    col_name : str
        Column whose values are counted (missing values included). It is
        also used, upper-cased, in the chart title and as the y-axis label.
    df_names : iterable of pandas.DataFrame
        Dataframes to chart, one figure each.

    Returns
    -------
    None
        Each chart is shown with ``plt.show()`` before the next is drawn.
    """
    for frame in df_names:
        # Look the column up by the name that was passed in. The previous
        # attribute access (df.column_name) ignored col_name and always read
        # a column literally called "column_name".
        top_values = frame[col_name].value_counts(dropna=False).head(10)
        top_values.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
        plt.title('Top 10 ' + col_name.upper()+'S')
        plt.ylabel(col_name)
        plt.xlabel('# Occurances')
        plt.show()

In [None]:
# 'column_name' is a template placeholder: replace it with a column present in all three subsets.
viz_top10('column_name', [df_call, df_chat, df_email])

In [None]:
# Row count of each channel subset. All three values are displayed only
# because InteractiveShell.ast_node_interactivity is set to "all" in the
# setup cell.
df_call.shape[0]
df_chat.shape[0]
df_email.shape[0]