####**TOKENIZAÇÃO DO TEXTO EM SENTENÇAS E CONTAGEM DE SENTENÇAS**

In [None]:
import nltk
import re

from google.colab import drive
# Mount Google Drive so the corpus and the output file are reachable.
drive.mount('/content/drive')

caminho_entrada = '/content/drive/MyDrive/Classificador/texto_natural/Graciliano.txt'
caminho_saida = '/content/drive/MyDrive/Classificador/sentencas_natural.txt'

# Read explicitly as UTF-8 instead of relying on the platform default, so
# accented Portuguese characters are decoded the same way everywhere.
# NOTE(review): assumes the corpus file is stored as UTF-8 - confirm.
with open(caminho_entrada, 'r', encoding='utf-8') as f:
    texto = f.read()

# Collapse every run of whitespace (line breaks included) into one space so
# hard-wrapped lines do not leak into the sentences.
texto_limpo = re.sub(r'\s+', ' ', texto)

nltk.download('punkt_tab')

# sent_tokenize defaults to the English Punkt model; the corpus is Portuguese,
# so the Portuguese model is requested explicitly.
sentencas = nltk.sent_tokenize(texto_limpo, language='portuguese')

num_sentencas = len(sentencas)

for sentenca in sentencas:
    print(sentenca)

print(f"Número total de sentenças: {num_sentencas}")

# Write one sentence per line, in UTF-8, matching the encoding the later
# cells use when they read this file back.
with open(caminho_saida, 'w', encoding='utf-8') as f:
    f.writelines(sentenca + '\n' for sentenca in sentencas)

# Report where the sentences were saved (the original message carried a
# stray, unbalanced apostrophe in front of the path).
print(caminho_saida)

####**EMBARALHAMENTO E ESTRATIFICAÇÃO**

In [None]:
import random
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit
from google.colab import drive

def carregar_sentencas(file_path):
    """Read a UTF-8 text file and return its lines as a list of sentences.

    Every line is treated as one sentence. Leading and trailing whitespace
    (the newline included) is stripped; blank lines are kept as empty strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of str with one entry per line, in file order.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            sentences.append(raw_line.strip())
    return sentences

# Mount Google Drive so both sentence files can be read.
drive.mount('/content/drive')

# One-sentence-per-line files, one per class.
texto_artificial = '/content/drive/MyDrive/Classificador/sentencas_artificial.txt'
texto_natural = '/content/drive/MyDrive/Classificador/sentencas_natural.txt'

artificial_train = carregar_sentencas(texto_artificial)
natural_train = carregar_sentencas(texto_natural)

print(f"artificial: {len(artificial_train)}")
print(f"natural: {len(natural_train)}")

# Build the labelled dataset as (sentence, label) pairs: every artificial
# sentence first, then every natural one.
all_sents = []
for rotulo, grupo in (("artificial", artificial_train), ("natural", natural_train)):
    all_sents.extend((sent, rotulo) for sent in grupo)

print(f"Dataset size = {len(all_sents)} sentences")

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Class label of every (sentence, label) pair; used as the stratification target.
values = [rotulo for _, rotulo in all_sents]

# Single seeded 80/20 split that keeps the class proportions in both parts.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, pretest) index pair.
train_index, pretest_index = next(split.split(all_sents, values))
strat_train_set = [all_sents[posicao] for posicao in train_index]
strat_pretest_set = [all_sents[posicao] for posicao in pretest_index]

# Two-column tables: the sentence text and its class label.
train_df = pd.DataFrame({
    'Sentenca': [sent for sent, _ in strat_train_set],
    'Classe': [autor for _, autor in strat_train_set],
})
pretest_df = pd.DataFrame({
    'Sentenca': [sent for sent, _ in strat_pretest_set],
    'Classe': [autor for _, autor in strat_pretest_set],
})

print("DataFrame de Treinamento:")
print(train_df.head())

print("\nDataFrame de Pré-Teste:")
print(pretest_df.head())

# Persist both splits without the index column.
train_df.to_csv('/content/drive/MyDrive/Classificador/treinamento.csv', index=False)
pretest_df.to_csv('/content/drive/MyDrive/Classificador/preteste.csv', index=False)

####**PORCENTAGEM DE DADOS NOS CONJUNTOS DE TREINAMENTO E TESTE**

In [None]:
from google.colab import drive

from sklearn.model_selection import train_test_split


# Mount Google Drive at /content/drive so the sentence files under it can be read.
drive.mount('/content/drive')

def cat_proportions(data, cat):
    """Return the fraction of items in *data* whose label equals *cat*.

    Args:
        data: Sequence of items whose element at index 1 is the label
            (for example ``(sentence, label)`` pairs).
        cat: Label to count.

    Returns:
        A float between 0.0 and 1.0. An empty *data* yields 0.0 instead of
        raising ZeroDivisionError.
    """
    total = len(data)
    if total == 0:
        # Nothing to measure: report a zero share rather than dividing by zero.
        return 0.0
    matches = sum(1 for item in data if item[1] == cat)
    return matches / total

def read_data_from_file(file_path, category):
    """Read a UTF-8 text file and label each of its lines with *category*.

    Args:
        file_path: Path of the text file to read.
        category: Label attached to every line.

    Returns:
        A list of ``(stripped_line, category)`` tuples, one per line, in file
        order. Blank lines produce an empty string as the first element.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return [(raw_line.strip(), category) for raw_line in file]

artificial_file_path = '/content/drive/MyDrive/Classificador/sentencas_artificial.txt'
natural_file_path = '/content/drive/MyDrive/Classificador/sentencas_natural.txt'

artificial_data = read_data_from_file(artificial_file_path, "texto_artificial")
natural_data = read_data_from_file(natural_file_path, "texto_natural")

# read_data_from_file already returns (sentence, category) pairs, so the two
# lists are simply concatenated. Wrapping them in a second (item, category)
# tuple, as before, made every "text" a nested tuple instead of a sentence.
all_sents = artificial_data + natural_data

texts = [sent for sent, _ in all_sents]
labels = [categoria for _, categoria in all_sents]

# Stratified 80/20 split. The seed is fixed so the table is reproducible
# between runs; 42 matches the seed used by the StratifiedShuffleSplit cell.
strat_train_set, strat_pretest_set = train_test_split(
    list(zip(texts, labels)),
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

categories = ["texto_artificial", "texto_natural"]

# Header row followed by one row per category with its share of the whole
# dataset, of the training part and of the pre-test part.
rows = [["Category", "Overall", "Stratified train", "Stratified pretest"]]

for cat in categories:
    rows.append([
        cat,
        f"{cat_proportions(all_sents, cat):.6f}",
        f"{cat_proportions(strat_train_set, cat):.6f}",
        f"{cat_proportions(strat_pretest_set, cat):.6f}"
    ])
def print_table(rows):
    """Print *rows* as a left-aligned text table with ' | ' between cells.

    Each column is as wide as its longest cell (after ``str()`` conversion).
    A line of dashes spanning the table width is printed after every row.

    Args:
        rows: Sequence of rows; each row is a sequence of cell values.
    """
    column_widths = []
    for col in zip(*rows):
        column_widths.append(max(len(str(item)) for item in col))

    for row in rows:
        cells = []
        for position, cell in enumerate(row):
            # Pad each cell on the right up to its column width.
            cells.append(str(cell).ljust(column_widths[position]))
        print(" | ".join(cells))
        # Separator width: all column widths plus 3 characters per ' | '.
        print("-" * (sum(column_widths) + 3 * (len(row) - 1)))

# Print the proportions table held in `rows` (a header row plus one row per category).
print_table(rows)

####**ESTATÍSTICAS SIMPLES**

In [None]:
import os
from google.colab import drive
import pandas as pd

# Mount Google Drive so the CSV below can be read.
drive.mount('/content/drive')

# Load the pre-test split; the statistics below are computed from its 'Sentenca' column.
df = pd.read_csv('/content/drive/MyDrive/Classificador/preteste.csv')

def calcular_estatisticas(sentencas):
    """Compute three length and diversity ratios over a list of sentences.

    Words are whitespace-separated tokens (``str.split()``). The character
    total is the full length of each sentence string, so spaces and
    punctuation are counted. The vocabulary is the set of lower-cased tokens.

    Args:
        sentencas: List of sentence strings.

    Returns:
        A tuple ``(characters per word, words per sentence, words per unique
        word)``. Each ratio is rounded to 2 decimals, or is 0 when its
        denominator is 0.
    """
    def _razao(numerador, denominador):
        # Rounded ratio, falling back to 0 when there is nothing to divide by.
        if denominador > 0:
            return round(numerador / denominador, 2)
        return 0

    total_caracteres = sum(map(len, sentencas))
    palavras_por_sentenca = [sentenca.split() for sentenca in sentencas]
    total_palavras = sum(map(len, palavras_por_sentenca))
    total_sentencas = len(sentencas)
    vocabulario = {
        palavra.lower()
        for palavras in palavras_por_sentenca
        for palavra in palavras
    }

    return (
        _razao(total_caracteres, total_palavras),
        _razao(total_palavras, total_sentencas),
        _razao(total_palavras, len(vocabulario)),
    )

# Compute the three statistics once per sentence (the function was previously
# called three times per row, once for each output column).
estatisticas = [calcular_estatisticas([sentenca]) for sentenca in df['Sentenca']]

colunas_estatisticas = [
    'media_caracteres_por_palavra',
    'media_palavras_por_sentenca',
    'media_palavras_por_palavra_unica',
]
# Column i receives element i of every per-sentence result tuple.
for posicao, coluna in enumerate(colunas_estatisticas):
    df[coluna] = [resultado[posicao] for resultado in estatisticas]

caminho_saida = '/content/drive/MyDrive/Classificador/teste/preteste_com_estatisticas.csv'
# to_csv does not create missing directories, so make sure 'teste/' exists first.
os.makedirs(os.path.dirname(caminho_saida), exist_ok=True)
df.to_csv(caminho_saida, index=False)



####**ETIQUETAGEM**


In [None]:
!pip install spacy
!python -m spacy download pt_core_news_sm
import pandas as pd
import spacy
from collections import Counter

# Load spaCy's 'pt_core_news_sm' pipeline; `nlp` is what extract_pos_features calls.
nlp = spacy.load('pt_core_news_sm')

from google.colab import drive

# Mount Google Drive so the CSV below can be read.
drive.mount('/content/drive')

# Pre-test split; its 'Sentenca' and 'Classe' columns are used further down.
input_csv = '/content/drive/MyDrive/Classificador/preteste.csv'
df = pd.read_csv(input_csv)

# Quick look at the first rows.
print(df.head())

def extract_pos_features(sentenca):
    """Count how often each selected part-of-speech tag occurs in a sentence.

    The sentence is processed with the module-level ``nlp`` pipeline and the
    ``pos_`` attribute of every token is tallied.

    Args:
        sentenca: Sentence text passed to ``nlp``.

    Returns:
        A dict mapping each of 'DET', 'NOUN', 'VERB', 'ADV', 'ADJ', 'ADP',
        'PRON' and 'AUX' (in that order) to its count; tags that do not occur
        map to 0. Other tags are ignored.
    """
    contagem = Counter()
    for token in nlp(sentenca):
        contagem[token.pos_] += 1

    tags_de_interesse = ('DET', 'NOUN', 'VERB', 'ADV', 'ADJ', 'ADP', 'PRON', 'AUX')
    # Reading a missing key from a Counter returns 0 without inserting it.
    return {tag: contagem[tag] for tag in tags_de_interesse}

# One dict of tag counts per sentence.
features_list = df['Sentenca'].apply(extract_pos_features)

# Expand the dicts into one column per tag, and remember those tag columns
# before the text and label columns are appended.
features_df = pd.DataFrame(features_list.tolist())
tag_columns = list(features_df.columns)

features_df['Sentenca'] = df['Sentenca']
features_df['Classe'] = df['Classe']

# Final layout: sentence, class, then the tag-count columns.
columns = ['Sentenca', 'Classe'] + tag_columns
features_df = features_df.loc[:, columns]

print(features_df.head())

output_csv = '/content/drive/MyDrive/Classificador/etiquetagem_preteste.csv'
features_df.to_csv(output_csv, index=False)

print(f'salvo em: {output_csv}')


####**CLASSIFICADOR BASEADO EM MÉDIA DE CARACTERES POR PALAVRA, MÉDIA DE PALAVRAS POR SENTENÇA E MÉDIA DE PALAVRAS POR PALAVRA ÚNICA**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
from google.colab import drive

# Mount Google Drive so both statistics tables can be read.
drive.mount('/content/drive')

# Per-sentence statistics for the training and the pre-test splits.
df_treinamento = pd.read_csv('/content/drive/MyDrive/Classificador/teste/treinamento_com_estatisticas.csv')
df_teste = pd.read_csv('/content/drive/MyDrive/Classificador/teste/preteste_com_estatisticas.csv')

# The three statistics used as features; 'Classe' is the target.
colunas_estatisticas = [
    'media_caracteres_por_palavra',
    'media_palavras_por_sentenca',
    'media_palavras_por_palavra_unica',
]

X_train, y_train = df_treinamento[colunas_estatisticas], df_treinamento['Classe']
X_test, y_test = df_teste[colunas_estatisticas], df_teste['Classe']

# NOTE(review): these features are continuous averages, while MultinomialNB is
# documented for count-like features - confirm this model choice is intended.
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy * 100:.2f}%')

print("classification_report:")
print(classification_report(y_test, y_pred))

print("confusion_matrix:")
print(confusion_matrix(y_test, y_pred))


####**CLASSIFICADOR BASEADO EM FREQUÊNCIA DE TAGS**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
from google.colab import drive

# Mount Google Drive so both tag-count tables can be read.
drive.mount('/content/drive')

# Per-sentence part-of-speech counts for the training and the pre-test splits.
df_treinamento = pd.read_csv('/content/drive/MyDrive/Classificador/etiquetagem_treinamento.csv')
df_teste = pd.read_csv('/content/drive/MyDrive/Classificador/etiquetagem_preteste.csv')

# Tag-count columns used as features; 'Classe' is the target.
colunas_tags = ['DET', 'NOUN', 'VERB', 'ADV', 'ADJ', 'ADP', 'PRON', 'AUX']

X_train, y_train = df_treinamento[colunas_tags], df_treinamento['Classe']
X_test, y_test = df_teste[colunas_tags], df_teste['Classe']

# Multinomial Naive Bayes over the tag counts.
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy * 100:.2f}%')

print("classification_report:")
print(classification_report(y_test, y_pred))

print("confusion_matrix:")
print(confusion_matrix(y_test, y_pred))

####**CLASSIFICADOR BASEADO EM BAG OF WORDS**

In [None]:
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Mount Google Drive so the train and pre-test CSVs can be read.
drive.mount('/content/drive')

input_train_csv = '/content/drive/MyDrive/Classificador/treinamento.csv'
input_test_csv = '/content/drive/MyDrive/Classificador/preteste.csv'
df_train, df_test = pd.read_csv(input_train_csv), pd.read_csv(input_test_csv)

print("Dados de treinamento:")
print(df_train.head())
print("\nDados de teste:")
print(df_test.head())

# Plain Python lists of sentence texts and class labels for each split.
sentences_train, labels_train = df_train['Sentenca'].tolist(), df_train['Classe'].tolist()
sentences_test, labels_test = df_test['Sentenca'].tolist(), df_test['Classe'].tolist()

# Bag of words: no stop-word removal, vocabulary capped at 10000 terms.
# The vocabulary is learned from the training sentences only and then reused
# to encode the pre-test sentences.
vectorizer = CountVectorizer(stop_words=None, max_features=10000)
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# Multinomial Naive Bayes over the word counts.
nb_model = MultinomialNB()
nb_model.fit(X_train, labels_train)

y_pred = nb_model.predict(X_test)

accuracy = accuracy_score(labels_test, y_pred)
print(f'Acurácia: {accuracy:.4f}')

print('classification_report:')
print(classification_report(labels_test, y_pred))

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(labels_test, y_pred)

print(cm)

####**CLASSIFICADOR BASEADO EM TF-IDF**

In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so the train and pre-test CSVs can be read.
drive.mount('/content/drive')

input_train_csv = '/content/drive/MyDrive/Classificador/treinamento.csv'
df_train = pd.read_csv(input_train_csv)

input_test_csv = '/content/drive/MyDrive/Classificador/preteste.csv'
df_test = pd.read_csv(input_test_csv)

print("Dados de treinamento:")
print(df_train.head())
print("\nDados de teste:")
print(df_test.head())

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# The vocabulary and IDF weights are learned from the training sentences only
# and then reused to encode the pre-test sentences.
X_train_tfidf = vectorizer.fit_transform(df_train['Sentenca'])
y_train = df_train['Classe']

X_test_tfidf = vectorizer.transform(df_test['Sentenca'])
y_test = df_test['Classe']

from sklearn.naive_bayes import MultinomialNB

# Baseline model with default hyper-parameters. It is kept fitted as before,
# but it is not the model evaluated below.
nb_model = MultinomialNB()

nb_model.fit(X_train_tfidf, y_train)

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: smoothing strength and whether class priors are learned.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
    'fit_prior': [True, False]
    }

# 5-fold cross-validated search on the training data, scored by accuracy.
grid_search = GridSearchCV(estimator=nb_model, param_grid=param_grid, scoring='accuracy', cv=5)

grid_search.fit(X_train_tfidf, y_train)

print("grid_search.best_params:", grid_search.best_params_)

from sklearn.metrics import accuracy_score, classification_report

# Evaluate the tuned model. Previously y_pred was overwritten afterwards with
# nb_model.predict(...), so the reported metrics came from the untuned
# baseline and the grid search result was never used.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.4f}')

print('classification_report:')
print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

print(cm)