# Semantic Analysis
Obtein relevant semantic measures.

### Preprocess

In [1]:
path_to_MyModule = '..'

import sys
sys.path.insert(0, path_to_MyModule) 

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

import fasttext
from fasttext import util

from MyModule.SemanticAnalysisFunctions import *
from MyModule.GeneralFunctions import *
from MyModule.SummarizationFunctions import *

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_excel('..\datos.xlsx')[['ID','texto','desafio']]

In [3]:
# Limpieza
df.drop_duplicates(subset='texto', inplace=True)

# Quitando texto de mas en columna "desafio"
df['desafio'] = df['desafio'].apply(lambda x: re.findall('[0-9]+', x)[0])

# A str
df['texto'] = df['texto'].astype(str)

In [4]:
pp_object = Preprocess(lemma=False)
documents = df['texto'].values.tolist()
documents = pp_object.preprocess(documents)
documents = [d.split() for d in documents]

### Semantic Variability

In [5]:
# Load model
fasttext.util.download_model('es', if_exists='ignore')  # First time only
ft = fasttext.load_model('cc.es.300.bin')

words_variability = {}
for i, doc in enumerate(documents):
    total_similarity = []
    for i2, word in enumerate(doc[1:]):
        
        word_1 = ft.get_word_vector(word)#.reshape(1, -1)
        word_2 = ft.get_word_vector(doc[i2-1])#.reshape(1, -1)
        
        similarity = semantic_words_distance(word_1, word_2)
        total_similarity.append(similarity)
        
    words_variability[i] = ongoing_semantic_variability(total_similarity)



### Semantic Granularity

In [6]:
words_granularity = {}
for i, doc in enumerate(documents):
    total_granularity = []
    for i2, word in enumerate(doc):
        gran = get_word_granularity(word)
        if gran: total_granularity.append(gran)
    if len(total_granularity)>0:    
        words_granularity[i] = np.mean(total_granularity)
    else:
        words_granularity[i] = 0

### Df

In [7]:
df = pd.DataFrame({'sem_variability': words_variability, 'sem_granularity': words_granularity})
df

Unnamed: 0,sem_variability,sem_granularity
0,,0.000000
1,0.005862,9.500000
2,0.016918,4.500000
3,0.053762,5.166667
4,0.008742,6.750000
...,...,...
500,0.051103,4.500000
501,0.018819,3.333333
502,0.022065,6.000000
503,,9.000000


In [12]:
df.to_csv('./Datasets/df_semantic.csv', index=False)