In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import json
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk # imports the natural language toolkit
nltk.download('punkt')
nltk.download('stopwords')
import string
import plotly
import plotly.express as px
from nltk.stem import PorterStemmer 
from collections import Counter
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.data import load
from nltk.stem import SnowballStemmer
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer
import unicodedata
import re
from nltk.tag import StanfordPOSTagger
# Locations of the Stanford POS tagger Spanish model and its JAR. In the visible
# notebook they are only referenced by the commented-out StanfordPOSTagger cell.
# NOTE(review): absolute, machine-specific paths -- consider making them configurable.
tagger="/home/ec2-user/stanford-tagger-4.0.0/models/spanish-ud.tagger"
jar="/home/ec2-user/stanford-tagger-4.0.0/stanford-postagger.jar"

In [None]:
%%time
# Raw report, loaded straight from the Excel export.
df = pd.read_excel('Original_Data/Reporte Uraba2019_CAGMV1Est.xlsx')

# GeoJSON parsed into a plain dict; it is passed to the choropleth further down.
with open('./GeoData/munis.geojson', encoding='utf-8') as geo:
    geojson = json.load(geo)

df.head()

In [None]:
# df["RepairCode"].value_counts()
# null_rc = df[df["RepairCode"].isnull()]
# len(df["RepairCode"])
# Inventory of the column labels available in the report.
df.columns.tolist()


In [None]:

# Interactive point map: one small circle per record, coloured by its Priority.

# Coordinates are divided by 1e6 to obtain decimal degrees (presumably the file
# stores micro-degrees -- TODO confirm against the data dictionary).
# The guard keeps the cell idempotent: decimal degrees never exceed 90 / 180 in
# magnitude, so columns that were already converted are not divided again on re-run.
COORD_SCALE = 1000000
if df.Latitude.abs().max() > 90 or df.Longitude.abs().max() > 180:
    df.Latitude = df.Latitude / COORD_SCALE
    df.Longitude = df.Longitude / COORD_SCALE

# One colour per distinct priority, in order of first appearance: pure red with the
# green channel raised in steps of 30. The channel is clamped at 255 so that more
# than nine distinct priorities still produce valid "#rrggbb" strings.
# TODO: improve the palette (e.g. a dedicated function per RGB channel).
priorities = df.Priority.unique()
color_ramp = ['#%02x%02x%02x' % (255, min(255, i * 30), 0) for i in range(len(priorities))]
# `priority_colors` is kept in reversed order because that is the value this cell
# has always left behind; the mapping itself uses the un-reversed ramp.
priority_colors = color_ramp[::-1]
priority_col_dict = dict(zip(priorities, color_ramp))

#Center in Apartadó, Antioquia: (7.88299, -76.62587)
antioquia_map2 = folium.Map(location=[7.88299, -76.62587],
                            zoom_start=9,
                            tiles="OpenStreetMap")

# Walk the three columns positionally; label-indexing with range(len(df)) would
# fail (or pick the wrong rows) as soon as df carries a non-default index.
for lat, lon, priority in zip(df["Latitude"], df["Longitude"], df["Priority"]):
    folium.CircleMarker(location=[lat, lon],
                        radius=2,
                        color=priority_col_dict[priority],
                        fill=True).add_to(antioquia_map2)


antioquia_map2

In [None]:
%%time
# df.town.unique()
# Upper-case town names; this column is matched against the GeoJSON feature
# property given in `featureidkey` below. Missing towns become the string "NAN".
df['town_upper'] = df.town.apply(lambda x: str(x).upper())

# Per-town averages. numeric_only=True is required on pandas >= 2.0, where mean()
# raises a TypeError if the frame still contains text columns.
dff = df.groupby('town_upper').mean(numeric_only=True).reset_index()

px.choropleth_mapbox(dff,                          #Data
        locations='town_upper',                    #Column containing the identifiers used in the GeoJSON file 
        color='Priority',                          #Column giving the color intensity of the region
        geojson=geojson,                           #The GeoJSON file
        featureidkey="properties.MPIO_CNMBR",
        zoom=5,                                    #Zoom
        mapbox_style="carto-positron",             #Mapbox style, for different maps you need a Mapbox account and a token
        center={"lat": 7.88299, "lon": -76.62587}, #Center
        color_continuous_scale="Viridis",          #Color Scheme
        opacity=0.5,                               #Opacity of the map
        )    



In [None]:
# Word cloud built from the distinct RepairCode descriptions.

# Upper-cased string form of RepairCode (a missing value becomes the text "NAN").
df["RepairCodeString"] = df["RepairCode"].apply(lambda x: str(x).upper())

text = " ".join(list(df["RepairCodeString"].unique()))

# eliminate irrelevant words and separators
irrelevant = ["RAMALES", " DE ", " EN ",  " POR ", " O " , " Y ", "/" ," PARA ", "(", ")", " - ", " -",  "- " ]

for st in irrelevant:
    text = text.replace(st," ")

# eliminate verbs: crude heuristic that treats every token ending in AR / ER / IR
# as an infinitive. Filtering is done token by token; replacing the word through
# str.replace would also delete that same letter sequence inside unrelated words.
VERB_ENDINGS = ("AR", "ER", "IR")
text = " ".join(word for word in text.split() if not word.endswith(VERB_ENDINGS))

# Create and generate a word cloud image:
wordcloud = WordCloud().generate(text)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# df["RepairCodeString"]

# Text analysis RepairCode


## First we extract the text and clean it

To do that, we remove the Spanish stopwords and the filler tokens NAN, RAMAL and RAMALES.

In [None]:
# NLTK's Spanish stopword list, upper-cased to match the upper-cased repair codes.
es_stopwords = [str(stopword).upper() for stopword in stopwords.words("spanish")]

def remove_accents(input_str):
    """Return ``input_str`` with every combining mark (accent, tilde, ...) removed.

    The text is NFKD-normalised first, which splits an accented letter into its
    base character followed by combining marks; those marks are then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_chars = filter(lambda ch: not unicodedata.combining(ch), decomposed)
    return "".join(base_chars)

# Accent-free stopwords followed by the corpus-specific filler tokens.
es_stopwords_na = list(map(remove_accents, es_stopwords)) + ["NAN", "RAMAL", "RAMALES"]

def clean_text(text):
    """Strip punctuation, accents and stopwords from an upper-case text.

    Steps:
      1. every character that is not a word character (``\\w``: letters, digits,
         underscore) is replaced by a space;
      2. accents are removed with ``remove_accents``;
      3. tokens listed in the module-level ``es_stopwords_na`` are dropped.

    Stopwords are matched as whole whitespace-separated tokens, so they are also
    removed at the start or end of the text and when repeated, and "NAN" is never
    cut out of the middle of a longer word. The list is upper-case, so ``text``
    is expected to be upper-case too.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    pattern = r"[^\w]"
    ret = re.sub(pattern, " ", text)
    ret = remove_accents(ret)
    # Set membership replaces one str.replace pass per stopword.
    blocked = set(es_stopwords_na)
    return " ".join(token for token in ret.split() if token not in blocked)
    
# Create clean column
df["RepairCodeStringClean"] = df["RepairCodeString"].apply(clean_text)

# Single corpus string: every repair code joined together, then cleaned.
all_reviews_text = ' '.join(df["RepairCodeString"])
all_reviews_text = clean_text(all_reviews_text)

# Show only a preview; the full corpus would flood the cell output.
print(all_reviews_text[:500])

# Get tokens
tokenized_words = nltk.word_tokenize(all_reviews_text)
# keep only tokens longer than two characters
tokenized_words = [each for each in tokenized_words if len(each) > 2]

word_freq = Counter(tokenized_words)
ten_pct = round(len(word_freq) * 0.1)

## Top 10% (printed explicitly: only the last expression of a cell is displayed)
top_words = word_freq.most_common(ten_pct)
print(top_words)

## Similarly, bottom 10%, least common first. The [:-n-1:-1] slice includes the
## very last entry and yields an empty list when ten_pct is 0.
bottom_words = word_freq.most_common()[:-ten_pct - 1:-1]
print(bottom_words)

# First cleaned repair codes that still contain text after cleaning.
has_text = df["RepairCodeStringClean"].str.strip() != ""
df.loc[has_text, "RepairCodeStringClean"].head()

## First 5 repair codes n-grams
# first_5_revs = AllRCs[0:5]
# word_tokens = nltk.word_tokenize(''.join(first_5_revs))
# list(ngrams(word_tokens, 3)) #ngrams(word_tokens,n) gives the n-grams.

# N-Grams RepairCode

In [None]:
def top_k_ngrams(word_tokens,n,k):
    """Print the ``k`` most frequent ``n``-grams of ``word_tokens``.

    Each n-gram is rendered as its tokens joined by a single space. The printed
    value is the list of ``(n-gram, count)`` pairs given by
    ``Counter.most_common(k)``. Nothing is returned.
    """
    # Count the space-joined n-grams in their order of appearance.
    counts = Counter(' '.join(gram) for gram in ngrams(word_tokens, n))
    print(counts.most_common(k))

In [None]:
# Ten most frequent unigrams, bigrams, trigrams and 4-grams of the repair codes,
# printed in that order (one list per n) instead of four copy-pasted cells.
for ngram_size in range(1, 5):
    top_k_ngrams(tokenized_words, ngram_size, 10)

In [None]:
# nltk.pos_tag(tokenized_words)
# import spaghetti as sgt

# sent1 = 'Mi colega me ayuda a programar cosas .'.split()
# sent2 = 'Está embarazada .'.split()
# test_sents = [sent1, sent2]

# # Default Spaghetti tagger.
# print (sgt.pos_tag(test_sent))

# # Tag multiple sentences.
# print (sgt.pos_tag_sents(test_sents))

# spa_tagger = sgt.CESSTagger()
# # POS tagger trained on unigrams of CESS corpus.
# spa_unigram_tagger = spa_tagger.uni
# print (spa_unigram_tagger.tag(sent1))
# # POS tagger trained on bigrams of CESS corpus.
# spa_bigram_tagger = spa_tagger.bi
# print (spa_bigram_tagger.tag(sent2))
# print (spa_bigram_tagger.tag_sents(test_sents))

In [None]:
# # Now let's POS-tag everything
# etiquetador=StanfordPOSTagger(tagger,jar)
# etiquetas=etiquetador.tag(tokenized_words)
# etiquetas