# Setup

In [None]:
import pandas as pd
import numpy as np
import src.maude_interface as maude
import logging
import os

# Show DEBUG-level (and above) log records.
logging.basicConfig(level=logging.DEBUG)
# The folders below are relative paths, so record which directory they resolve against.
# Lazy %-style arguments let logging format the message only if it is emitted.
logging.debug("Current working dir: %s", os.getcwd())
# String prefixes (note the trailing "/"): file names are appended to them.
data_folder = "../../data/"
reference_folder = "../reference/"

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the 'stopwords' and 'punkt' NLTK resources.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# English stop-word list from the downloaded corpus.
stop = stopwords.words('english')

# Code used for Initial testing/development
## Compile Database to analyze (*deprecated* in favor of maude.compile_maude_database)

In [None]:
# Load the sample MDR file through the helper module.
mdr_base = maude.read_mdr_file(f"{data_folder}mdrfoi-test.txt")
# Quick look at the result (presumably a DataFrame - confirm against the helper).
mdr_base.info()
# Last expression of the cell, so the notebook renders it.
mdr_base.head()

In [None]:
# Tests to prove merging data and making large database.
mdr_base = maude.maude_to_pandas(f"{data_folder}mdrfoi-test.txt")
#foidev_test = maude.maude_to_pandas("./data/foidev.txt")

# Fold the two device files into the base table.
mdr_total = maude.add_data_to_mdr(
    mdr_base,
    (f"{data_folder}foidev1998.txt", f"{data_folder}foidevproblem.txt"),
)

# Left-join the problem-code reference table on DEVICE_PROBLEM_CODE.
mdr_total = pd.merge(
    left=mdr_total,
    right=maude.maude_to_pandas(f"{reference_folder}deviceproblemcodes.txt"),
    how='left',
    on='DEVICE_PROBLEM_CODE',
)

## Save Database for later

In [None]:
# Persist the merged table, then report its dimensions and column labels.
mdr_total.to_pickle(f"{data_folder}mdrfoi_manual_compiled.pkl")
print(mdr_total.shape)
print(mdr_total.columns)
#print(mdr_total)

## Check if a CSV File is properly formatted

In [None]:
# Run the helper module's CSV check on the excluded foidev file.
maude.check_bad_csv(f"{data_folder}excluded/foidev.txt")

In [None]:
# NOTE(review): foidev_dtype is built but never passed on; the call below hands
# maude_to_pandas a bare np.float32 instead - confirm which one was intended.
foidev_dtype = {'MDR_REPORT_KEY': np.float32, 'DEVICE_PROBLEM_CODE': np.float32}
foidev_df = maude.maude_to_pandas(data_folder + "foidevproblem.txt", np.float32)
#print(foidev_df.dtypes)
# info() writes its report to stdout itself and returns None (assuming foidev_df
# is a pandas DataFrame), so wrapping it in print() only added a stray "None".
foidev_df.info()
#foidev_df.apply(pd.to_numeric, errors='coerce', downcast='float')
#foidev_df.info()
# Number of missing values per column.
print(foidev_df.isnull().sum())

In [None]:
# Drop the development-only tables from the notebook namespace.
del mdr_base, mdr_total, foidev_df

In [None]:
# Tries casting a column to categories. Used for classification, like hardware/software issue
problem_codes = maude.maude_to_pandas(f"{reference_folder}deviceproblemcodes.txt")
problem_codes['ERR_TYPE'] = problem_codes['ERR_TYPE'].astype('category')
# Summary statistics, followed by how many rows fall in each ERR_TYPE category.
print(problem_codes.describe())
print(f"\n\nProblem Code categories:\n{problem_codes['ERR_TYPE'].value_counts()}")

# PRODUCTION CODE
## New Way to Generate ENTIRE MAUDE Database

In [None]:
# alternative to all above statements except pickle
# Single helper call that is given the data folder, the reference folder and the
# file name "mdrfoiThru2017.txt"; the result is kept in all_data.
all_data = maude.compile_maude_database(data_folder, reference_folder, "mdrfoiThru2017.txt")

## Perform Learning/Analysis

In [None]:
# Fetch the text records through the helper module, which is given data_folder.
all_text_data = maude.get_all_text_data(data_folder)
# Inspect the result (presumably a DataFrame - confirm against the helper).
all_text_data.info()
print(all_text_data.shape)

### Save All Text Data

In [None]:
# Cache the text table on disk inside the data folder.
all_text_data.to_pickle(f"{data_folder}all_text_data.pkl")

### Stem

In [None]:
# preprocessing
from textblob import TextBlob

# Keep only rows that actually have narrative text.
just_text = all_text_data['FOI_TEXT'].dropna()


def stem_blob(input):
    """Return ``input`` lower-cased with every word stemmed, joined by spaces.

    Args:
        input: Raw text, or a missing value (NaN/None). The name shadows the
            builtin but is kept so existing callers are unaffected.

    Returns:
        A plain ``str`` of space-separated stems; a missing value is returned
        unchanged.
    """
    # pd.isna recognises None and any NaN object; the earlier identity test
    # against np.nan only matched that one singleton.
    if pd.isna(input):
        return input
    # Join with a plain str (as the sample cell at the end of the notebook does)
    # rather than TextBlob(" ").join, which yields a TextBlob; the later
    # CountVectorizer cells are fed these values.
    return " ".join(w.stem() for w in TextBlob(input).words.lower())


# Series.apply returns a new Series and leaves the original untouched; rebind so
# the pickle and vectorizer cells that follow really operate on stemmed text.
just_text = just_text.apply(stem_blob)
# Keep the cell's displayed output: show the stemmed series.
just_text

### Save Stemmed Text

In [None]:
# Write the just_text series to the stemmed-text pickle in the data folder.
just_text.to_pickle(f"{data_folder}just_text_stemmed.pkl")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Build a token-count matrix over the text series with default vectorizer settings.
count = CountVectorizer()
bag = count.fit_transform(just_text.values)
# vocabulary_ maps each term to its feature (column) index in the fitted matrix.
print(count.vocabulary_)

In [None]:
import operator

# Same counting step, restricted to two-word n-grams.
count2 = CountVectorizer(ngram_range=(2, 2))
two_gram_bag = count2.fit_transform(just_text.values)
# NOTE(review): vocabulary_ values are feature (column) indices, not occurrence
# counts, so this ordering is by index - confirm that is what was wanted.
sorted_vocab = sorted(count2.vocabulary_.items(), key=operator.itemgetter(1))
# Show the last 100 entries; the earlier [-100:-1] slice dropped the final one.
print(sorted_vocab[-100:])

In [None]:
# Sample sentence wrapped in a TextBlob; the next cell stems its words.
blb = TextBlob("the PHYSICIAN SHOCKED BY ELECTROSURGICAL PENCIL.")

In [None]:
# Lower-case the blob's words, stem each one, and join them with single spaces.
stemmed = " ".join(w.stem() for w in blb.words.lower())
print(stemmed)