In [None]:
import re
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from wordcloud import WordCloud
# Figure resolution: 80 dpi for figures, 300 dpi for files written with savefig.
plt.rcParams.update({'figure.dpi': 80, 'savefig.dpi': 300})

## Lista de keywords de um determinado ano

In [None]:
# Load every document record and group its keywords by year.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default; `with` closes the file once it is parsed.
with open("keywords_all_year.json", encoding="utf-8") as f:
    all_text_year = list(json.load(f))

keywords_year = {}    # year -> flat list with every keyword of that year's records
text_quant_year = {}  # year -> number of records seen for that year

for a in all_text_year:
    # extend() copies the keywords into a list owned by keywords_year, so the
    # records kept in all_text_year are never mutated.
    keywords_year.setdefault(a['year'], []).extend(a['keywords'])
    text_quant_year[a['year']] = text_quant_year.get(a['year'], 0) + 1

## Lista de keywords de acordo com ano e tipo

In [None]:
# Load every document record and group its keywords by year and text type.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default; `with` closes the file once it is parsed.
with open("keywords_all_year.json", encoding="utf-8") as f:
    all_text_year = list(json.load(f))

keywords_yearNtype = {}    # '<year>_<text_type>' -> flat list of keywords
text_quant_yearNtype = {}  # '<year>_<text_type>' -> number of records

for c in all_text_year:
    # Composite key: the year and the text type joined by an underscore.
    conj_value = c['year'] + '_' + c['text_type']
    # extend() copies the keywords into a list owned by keywords_yearNtype, so
    # the records kept in all_text_year are never mutated.
    keywords_yearNtype.setdefault(conj_value, []).extend(c['keywords'])
    text_quant_yearNtype[conj_value] = text_quant_yearNtype.get(conj_value, 0) + 1

## Lista de keywords de acordo com tipo

In [None]:
# Load every document record and group its keywords by text type.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default; `with` closes the file once it is parsed.
with open("keywords_all_year.json", encoding="utf-8") as f:
    all_text_year = list(json.load(f))

keywords_type = {}    # text type -> flat list with every keyword of that type
text_quant_type = {}  # text type -> number of records of that type

for e in all_text_year:
    # extend() replaces the per-keyword loop (whose loop variable used to
    # overwrite the file handle name `f`) and copies the keywords, so the
    # records kept in all_text_year are never mutated.
    keywords_type.setdefault(e['text_type'], []).extend(e['keywords'])
    text_quant_type[e['text_type']] = text_quant_type.get(e['text_type'], 0) + 1

## Gráfico de linhas de plano de negócio e monografia por ano

In [None]:
# Yearly share of each TCC type ('plano' vs. 'monografia'), drawn as a line chart.
data1 = {'year': ['2021','2020','2019','2018','2017','2016'], 'plano': [], 'monografia': []}
tcc_types = ['plano', 'monografia']

for year in data1['year']:
    # Counts are looked up by their '<year>_<type>' key, so each value lines up
    # with its year label regardless of the dict's insertion order, and a
    # missing combination counts as 0 instead of reusing a stale value.
    counts = {tcc_type: text_quant_yearNtype.get(year + '_' + tcc_type, 0) for tcc_type in tcc_types}
    total = sum(counts.values())
    for tcc_type in tcc_types:
        # A year without any document has no defined share; NaN leaves a gap in the line.
        data1[tcc_type].append(counts[tcc_type] / total if total else float('nan'))

# The year labels are written newest-first; flip everything to chronological order.
data1['year'].reverse()
data1['plano'].reverse()
data1['monografia'].reverse()

data_preproc = pd.DataFrame(data1)
# melt() turns the two type columns into (variable, value) rows so that hue can
# split them into one line per type.
fig = sns.lineplot(x='year', y='value', hue='variable',
                   data=pd.melt(data_preproc, ['year']), marker="o")

fig.set(xlabel='Ano', ylabel='Percentual', ylim=(0, 1))

## Gráfico de quantidade de TCCs por ano

In [None]:
# Number of TCCs per year, drawn as a bar chart.
data2 = {'year': ['2021','2020','2019','2018','2017','2016'], 'tcc': []}

for j in data2['year']:
    # A year with no records counts as 0 instead of raising KeyError.
    data2['tcc'].append(text_quant_year.get(j, 0))

# The year labels are written newest-first; flip both columns to chronological order.
data2['year'].reverse()
data2['tcc'].reverse()

data_preproc = pd.DataFrame(data2)
sns.barplot(x='year', y='value', hue='variable',
            data=pd.melt(data_preproc, ['year']))

print(data2['tcc'])

## Nuvem de palavras

In [None]:
ano = '2021'  # year whose keywords are drawn
text = " ".join(keywords_year[ano])
text = text.replace(" p ", " ")  # drop the stand-alone token "p"

# Round mask: cells outside a radius-130 circle centred on the 300x300 grid are
# set to 255, the rest to 0.
# NOTE(review): WordCloud is expected to treat the 255 (white) cells as
# masked out, so words land inside the circle; confirm against its docs.
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

# Build the cloud once, with the final settings (the earlier default-settings
# cloud was discarded immediately, so it is no longer generated).
wordcloud = WordCloud(mask=mask, background_color="white", max_words=len(keywords_year[ano]),
                      max_font_size=40, relative_scaling=.5).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## Principais palavras de cada ano

In [None]:
# Tokens that are never counted.
blocked_words = ['unk', '<', '>']

# year -> {keyword: number of occurrences among that year's keywords}
word_count_year = {}
for year, words in keywords_year.items():
    tally = {}
    for word in words:
        if word in blocked_words:
            continue
        tally[word] = tally.get(word, 0) + 1
    word_count_year[year] = tally

In [None]:
# year -> [top keyword, its occurrence count divided by the year's record count]
most_used_word_year = {}
for year in keywords_year:
    # Rank the year's keywords by descending count and store the ranked dict back.
    ranking = sorted(word_count_year[year].items(), key=lambda pair: pair[1], reverse=True)
    word_count_year[year] = dict(ranking)
    top_word, top_count = ranking[0][0], ranking[0][1]
    most_used_word_year[year] = [top_word, top_count / text_quant_year[year]]

In [None]:
# Table with each year's top keyword, listed in chronological order.
years_oldest_first = ['2021','2020','2019','2018','2017','2016'][::-1]

data4 = {
    'year': years_oldest_first,
    'most used word': [most_used_word_year[y][0] for y in years_oldest_first],
    'percentage': [most_used_word_year[y][1] for y in years_oldest_first],
}

palette = ['r']

df = pd.DataFrame(data4)
df

## Principais palavras de cada ano excluindo as palavras 'empresa' e 'ano'

In [None]:
# Tokens that are never counted (here also 'empresa' and 'ano').
blocked_words = ['unk', '<', '>', 'empresa', 'ano']

# Pass 1: year -> {keyword: number of occurrences}, skipping blocked tokens.
word_count_year2 = {}
for year, words in keywords_year.items():
    tally = {}
    for word in (w for w in words if w not in blocked_words):
        tally[word] = tally.get(word, 0) + 1
    word_count_year2[year] = tally

# Pass 2: rank each year's keywords by descending count and keep the leader as
# [keyword, occurrence count divided by the year's record count].
most_used_word_year2 = {}
for year in keywords_year:
    ranking = sorted(word_count_year2[year].items(), key=lambda pair: pair[1], reverse=True)
    word_count_year2[year] = dict(ranking)
    most_used_word_year2[year] = [ranking[0][0], ranking[0][1] / text_quant_year[year]]

In [None]:
# Table with each year's top keyword (computed with 'empresa' and 'ano' excluded).
data6 = {'year': ['2021','2020','2019','2018','2017','2016'], 'most used word': [], 'percentage': []}

# Look every year up by key so each row lines up with its 'year' label,
# whatever order most_used_word_year2 was filled in.
for j in data6['year']:
    data6['most used word'].append(most_used_word_year2[j][0])
    data6['percentage'].append(most_used_word_year2[j][1])

# The year labels are written newest-first; flip every column to chronological order.
data6['year'].reverse()
data6['most used word'].reverse()
data6['percentage'].reverse()

palette = ['r']  # not read in this cell

df2 = pd.DataFrame(data6)
df2

## Principais palavras de cada ano por tipo de TCC excluindo as palavras 'ano', 'empresa' e 'fonte'

In [None]:
# Tokens that are never counted (here also 'ano', 'empresa' and 'fonte').
blocked_words = ['unk', '<', '>', 'ano', 'empresa', 'fonte']

# '<year>_<type>' -> {keyword: number of occurrences}, skipping blocked tokens.
word_count_year3 = {}
for group, words in keywords_yearNtype.items():
    frequencies = {}
    for word in words:
        if word in blocked_words:
            continue
        if word in frequencies:
            frequencies[word] += 1
        else:
            frequencies[word] = 1
    word_count_year3[group] = frequencies

# '<year>_<type>' -> [top keyword, its count divided by the group's record count]
most_used_word_year3 = {}
for group in keywords_yearNtype:
    by_count_desc = sorted(word_count_year3[group].items(), key=lambda pair: pair[1], reverse=True)
    word_count_year3[group] = dict(by_count_desc)
    leader = by_count_desc[0]
    most_used_word_year3[group] = [leader[0], leader[1] / text_quant_yearNtype[group]]


In [None]:
# One table row per '<year>_<type>' group, in reverse of the dict's insertion order.
rows = []
for group, stats in most_used_word_year3.items():
    # The group key is the year and the TCC type joined by the first underscore.
    year_part, sep, type_part = group.partition('_')
    rows.append((year_part, type_part, stats[0], stats[1]))
rows.reverse()

data5 = {
    'year': [row[0] for row in rows],
    'TCC type': [row[1] for row in rows],
    'most used word': [row[2] for row in rows],
    'percentage': [row[3] for row in rows],
}

palette = ['r']

df = pd.DataFrame(data5)
df

## Principais palavras de cada ano por tipo de texto

In [None]:
# Tokens that are never counted.
blocked_words = ['unk', '<', '>']

# '<year>_<type>' -> {keyword: number of occurrences}, skipping blocked tokens.
word_count_year3 = {}
for group, words in keywords_yearNtype.items():
    tally = word_count_year3[group] = {}
    for word in words:
        if word not in blocked_words:
            tally[word] = tally.get(word, 0) + 1

# '<year>_<type>' -> [top keyword, its count divided by the group's record count]
most_used_word_year3 = {}
for group in keywords_yearNtype:
    ranking = sorted(word_count_year3[group].items(), key=lambda pair: pair[1], reverse=True)
    word_count_year3[group] = dict(ranking)
    top_word, top_count = ranking[0][0], ranking[0][1]
    most_used_word_year3[group] = [top_word, top_count / text_quant_yearNtype[group]]

In [None]:
# One table row per '<year>_<type>' group, in reverse of the dict's insertion order.
groups_reversed = list(most_used_word_year3)[::-1]

data5 = {
    # The group key is the year and the TCC type joined by the first underscore.
    'year': [group.split('_', 1)[0] for group in groups_reversed],
    'TCC type': [group.split('_', 1)[1] for group in groups_reversed],
    'most used word': [most_used_word_year3[group][0] for group in groups_reversed],
    'percentage': [most_used_word_year3[group][1] for group in groups_reversed],
}

palette = ['r']

df = pd.DataFrame(data5)
df

## Uso de determinada palavra ao longo dos anos

In [None]:
choosen_word = 'marketing'  # word whose usage is tracked over the years
data3 = {'year': ['2021','2020','2019','2018','2017','2016'], choosen_word: []}

# year -> {choosen_word: occurrences}, or an empty dict when the word never
# appears among that year's keywords.
word_count_year3 = {}
for i in keywords_year:
    hits = keywords_year[i].count(choosen_word)
    word_count_year3[i] = {choosen_word: hits} if hits else {}

# Look every year up by key so each value lines up with its 'year' label,
# whatever order keywords_year was filled in. A year without the word (or
# without any record) contributes 0 instead of raising.
for l in data3['year']:
    hits = word_count_year3.get(l, {}).get(choosen_word, 0)
    docs = text_quant_year.get(l, 0)
    data3[choosen_word].append(hits / docs if hits and docs else 0)

# The year labels are written newest-first; flip both columns to chronological order.
data3['year'].reverse()
data3[choosen_word].reverse()

data_preproc = pd.DataFrame(data3)
fig = sns.lineplot(x='year', y='value', hue='variable',
                   data=pd.melt(data_preproc, ['year']), marker="o")

fig.set(xlabel='Ano', ylabel='Percentual', ylim=(0, 1))
print(data3[choosen_word])

## Proporção de trabalhos em língua não portuguesa

In [None]:
# Hand-entered number of non-Portuguese TCCs per year.
non_pt = {'2021': 0, '2020': 5, '2019': 1, '2018': 0, '2017': 1, '2016': 3}
data7 = {'year': ['2021','2020','2019','2018','2017','2016'], 'TCCs em lingua estrangeira': []}

# Look every year up by key so each value lines up with its 'year' label,
# whatever order text_quant_year was filled in.
for i in data7['year']:
    foreign = non_pt[i]
    # NOTE(review): the denominator adds the foreign count to text_quant_year,
    # which presumes text_quant_year does not already include those documents;
    # confirm against how keywords_all_year.json was built.
    total = foreign + text_quant_year.get(i, 0)
    # A year without any document contributes 0 instead of dividing by zero.
    data7['TCCs em lingua estrangeira'].append(foreign / total if total else 0)

# The year labels are written newest-first; flip both columns to chronological order.
data7['year'].reverse()
data7['TCCs em lingua estrangeira'].reverse()

data_preproc = pd.DataFrame(data7)
fig = sns.lineplot(x='year', y='value', hue='variable',
                   data=pd.melt(data_preproc, ['year']), marker="o")

fig.set(xlabel='Ano', ylabel='Percentual', ylim=(0, 1))
print(data7['TCCs em lingua estrangeira'])