In [None]:
import spacy
import geopandas as geopd
import pandas as pd
import os
import fiona 
import shapely
from shapely.geometry import Point
from shapely.wkt import loads
import matplotlib.pyplot as plot

In [None]:
from collections import Counter
from spacy import displacy

In [None]:
# Raise pandas' display limits: show up to 600 rows and up to 400 characters per cell.
pd.set_option("display.max_rows", 600)
pd.set_option("display.max_colwidth", 400)

In [None]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
import nltk

# Download only the resources needed by the NLTK imports in this notebook.
# A bare nltk.download() opens the interactive downloader, which blocks
# "Restart Kernel -> Run All".
nltk.download("stopwords")  # word lists behind nltk.corpus.stopwords
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead of "punkt" — confirm
# against the installed version.
nltk.download("punkt")      # tokenizer models used by word_tokenize

In [None]:
%pip install requests
import requests

In [None]:
#%pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Create one VADER SentimentIntensityAnalyzer instance and bind it to `sa`.
# NOTE(review): VADER's lexicon is English-oriented, while the corpus file name
# ("..._es_...") suggests Spanish text — confirm what text `sa` is applied to.
sa = SentimentIntensityAnalyzer()

In [None]:
#%pip install geograpy3
import geograpy
from geograpy import extraction
from geograpy import places

In [None]:
%pip install geopy
from geopy.geocoders import Nominatim
import time


El Corrido de Gregorio Cortez 
by Diane Lopez
September 8, 2022
Text analysis, sentiment analysis, and GIS project

In [None]:
# Read the corrido text file.
# The path is built with os.path.join rather than a hard-coded backslash:
# "\g" is an invalid string escape, and a backslash separator only works on Windows.
corpus_path = os.path.join("corrido corpus", "gregoriocortez_es_corrido.txt")

# The context manager closes the file handle as soon as the text has been read.
with open(corpus_path, encoding="utf-8") as corpus_file:
    gregText = corpus_file.read()

# Replace line breaks with spaces so the text is one continuous string
text_linebreaks = gregText.replace('\n', ' ')


In [None]:
import es_core_news_md 

# Load the "es_core_news_md" spaCy pipeline; `nlp` is the object the cells below call to process text.
nlp = spacy.load("es_core_news_md")

In [None]:
# Run the loaded pipeline over the flattened corpus text to obtain a Doc
doc = nlp(text_linebreaks)

# Print each token's surface form next to its lemma, one token per line
for tok in doc:
    print(f"{tok.text} {tok.lemma_}")

Named Entity Recognition (NER)

In [None]:
#NER with Long Texts or Many Texts
import math

# Target number of pieces the corpus is cut into before running the pipeline.
number_of_chunks = 80

# Chunk from `text_linebreaks` (the flattened corpus defined above); the name
# `text` does not exist on a fresh kernel at this point.
# max(1, ...) keeps the range() step valid even when the text is empty.
chunk_size = max(1, math.ceil(len(text_linebreaks) / number_of_chunks))

# Slice by character offset. NOTE: a cut can fall in the middle of a word or
# entity name, so entities straddling a chunk boundary may be missed.
text_chunks = [
    text_linebreaks[start:start + chunk_size]
    for start in range(0, len(text_linebreaks), chunk_size)
]

# Process all chunks in one batch; the result is a list with one Doc per chunk.
chunked_documents = list(nlp.pipe(text_chunks))
chunked_documents

In [None]:
# Collect the text of every LOC entity across all chunked documents.
# The loop variable must be the same name the inner loop reads from; it was
# previously `text` while the body used `document`, which fails on a fresh kernel.
# NOTE: `places` rebinds the name imported by `from geograpy import places`;
# the name is kept so any later cell that reads this list continues to work.
places = [
    named_entity.text
    for document in chunked_documents
    for named_entity in document.ents
    if named_entity.label_ == "LOC"
]

# Count how often each place string occurs
places_tally = Counter(places)

# Tabulate the counts from most to least frequent and display the table
df = pd.DataFrame(places_tally.most_common(), columns=['place', 'count'])
df

In [None]:
#Get NER in Context
from IPython.display import Markdown, display
import re

def get_ner_in_context(keyword, document, desired_ner_labels=False):
    """Display each sentence of `document` that has a named entity matching `keyword`.

    For every sentence, the sentence text is re-processed with the module-level
    `nlp` pipeline. For each resulting entity whose text contains `keyword`
    (case-insensitive) and whose label is in `desired_ner_labels`, three
    Markdown outputs are displayed: a horizontal rule, the entity label in
    bold, and the sentence with the entity text in bold.

    Args:
        keyword: Substring to look for inside entity texts.
        document: A spaCy Doc with sentence boundaries available via `.sents`.
        desired_ner_labels: Iterable of entity labels to keep. `False` (the
            default) or `None` selects ['PER', 'ORG', 'LOC'].

    Returns:
        None. Output is produced through IPython's `display`.
    """
    # `False` is the historical "not provided" sentinel; `None` is accepted too.
    if desired_ner_labels is False or desired_ner_labels is None:
        desired_ner_labels = ['PER', 'ORG', 'LOC']

    keyword_lower = keyword.lower()

    # spaCy exposes sentence spans as `Doc.sents` (there is no `.sentences` attribute).
    for sentence in document.sents:
        # Process the sentence on its own so entities are recognised within it
        sentence_doc = nlp(sentence.text)
        # Flatten line breaks once per sentence for display
        flat_sentence = re.sub('\n', ' ', sentence.text)

        for named_entity in sentence_doc.ents:
            # Keep the entity only if the keyword occurs in the ENTITY text
            # (case-insensitive) and its label is one of the requested labels.
            if keyword_lower not in named_entity.text.lower():
                continue
            if named_entity.label_ not in desired_ner_labels:
                continue

            # Flatten line breaks in the entity too, so it can still be found
            # in the flattened sentence.
            entity_text = named_entity.text.replace('\n', ' ')

            # re.escape: entity text is literal text, not a regex pattern.
            # The function replacement keeps the sentence's own casing and avoids
            # backslash interpretation in the replacement string.
            sentence_text = re.sub(
                re.escape(entity_text),
                lambda match: f"**{match.group(0)}**",
                flat_sentence,
                flags=re.IGNORECASE,
            )

            display(Markdown('---'))
            display(Markdown(f"**{named_entity.label_}**"))
            display(Markdown(sentence_text))

In [None]:
# For each chunked document, display the sentences whose named entities contain 'Laredo'
# (default label filter of get_ner_in_context applies).
for document in chunked_documents:
    get_ner_in_context('Laredo', document)

Named Entity Recognition

In [None]:
#People
# Gather the text of every PER entity found in any of the chunked documents
people = [
    entity.text
    for chunk_doc in chunked_documents
    for entity in chunk_doc.ents
    if entity.label_ == "PER"
]

# Count the mentions and tabulate them from most to least frequent
people_tally = Counter(people)
gregPpl_df = pd.DataFrame(
    people_tally.most_common(),
    columns=['character', 'count'],
)
gregPpl_df

Part of Speech

In [None]:
# For every token, print its lemma, part-of-speech tag, and dependency label on one line
for tok in doc:
    print(" ".join((tok.lemma_, tok.pos_, tok.dep_)))

Keyword Extraction

In [None]:
import re
from IPython.display import Markdown, display

In [None]:
def find_sentences_with_keyword(keyword, doc):
    """Display, as Markdown, every sentence of `doc` that contains `keyword`.

    Matching is case-insensitive. In each displayed sentence, line breaks are
    replaced by spaces and every occurrence of the keyword is wrapped in `**`
    (bold), keeping the casing found in the sentence.

    Args:
        keyword: Literal text to search for (not a regular expression).
        doc: A spaCy Doc with sentence boundaries available via `.sents`.

    Returns:
        None. Output is produced through IPython's `display`.
    """
    # An empty keyword would match between every character; there is nothing to find.
    if not keyword:
        return

    keyword_lower = keyword.lower()
    # re.escape treats the keyword as literal text, so characters such as "." or "("
    # cannot change the match or raise a regex error.
    keyword_pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)

    #Iterate through all the sentences in the document and pull out the text of each sentence
    for sentence in doc.sents:
        sentence_text = sentence.text

        #Skip sentences that do not contain the keyword (ignoring capitalization)
        if keyword_lower not in sentence_text.lower():
            continue

        #Replace linebreaks, then bold each occurrence while keeping the sentence's own casing
        sentence_text = sentence_text.replace('\n', ' ')
        sentence_text = keyword_pattern.sub(
            lambda match: f"**{match.group(0)}**",
            sentence_text,
        )

        display(Markdown(sentence_text))
            

In [None]:
# Display every sentence of the full document that mentions "Laredo" (case-insensitive).
find_sentences_with_keyword(keyword="Laredo", doc=doc)


In [None]:
#Pair each alphabetic token's text with its part-of-speech label
tokens_and_labels = [
    (tok.text, tok.pos_)
    for tok in doc
    if tok.is_alpha
]

In [None]:
#Make a function to get all two-word combinations
def get_bigrams(word_list, number_consecutive_words=2):
    """Return every run of `number_consecutive_words` consecutive items from `word_list`.

    Each run is a slice of `word_list`, so it has the same sequence type as the
    input. Runs are returned in order of their starting position. With the
    default of 2 this yields bigrams.
    """
    # Number of starting positions that still leave room for a full-length run
    window_count = len(word_list) - (number_consecutive_words - 1)

    return [
        word_list[start : start + number_consecutive_words]
        for start in range(window_count)
    ]

In [None]:
# Build the list of consecutive pairs of (token text, POS label) entries.
bigrams = get_bigrams(tokens_and_labels)

In [None]:
# Preview a slice of the bigram list (indices 5 through 19).
bigrams[5:20]

In [None]:
def get_neighbor_words(keyword, bigrams, pos_label = None):
    """Count the words that share a bigram with `keyword`.

    Args:
        keyword: Word to look for; compared case-insensitively.
        bigrams: Iterable of bigrams, each an iterable of (word, label) pairs.
        pos_label: If given, only neighbors carrying this label are counted;
            `None` counts neighbors with any label.

    Returns:
        A list of (lowercased neighbor word, count) tuples, most frequent first.
    """
    target = keyword.lower()
    tally = Counter()

    for pair in bigrams:
        # Lowercase the words once, keeping each label alongside its word
        lowered = [(token.lower(), tag) for token, tag in pair]

        # Skip bigrams that do not contain the keyword at all
        if not any(token == target for token, _ in lowered):
            continue

        for token, tag in lowered:
            # The keyword itself is not a neighbor
            if token == target:
                continue
            # Count the neighbor when no label filter is set or its label matches
            if tag == pos_label or pos_label is None:
                tally[token] += 1

    return tally.most_common()

In [None]:
# Tally the words that appear in a bigram together with "Cortez" (no POS filter).
get_neighbor_words("Cortez", bigrams)


In [None]:
# Load 'log.csv' into a DataFrame and display it.
# NOTE(review): pd.read_fwf parses fixed-width-column text; given the .csv
# extension, pd.read_csv may be what is intended — confirm against the file.
greg_df = pd.read_fwf('log.csv')
greg_df


In [None]:
%pip install --user mordecai

In [None]:
import mordecai
# mordecai's geoparsing class is spelled `Geoparser`; a misspelled name raises ImportError.
from mordecai import Geoparser
