In [18]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
!pip install openai
!pip install openai==0.28
!pip install keras
!pip install networkx

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [19]:
import numpy as np
import nltk
from collections import Counter
from nltk.util import bigrams
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
import networkx as nx
import string
import pickle
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import openai

def sentence_length(text):
    sentences = nltk.sent_tokenize(text)
    if not sentences:  # Handle case where there are no sentences in the text
        return 0.0  # Return 0 if there are no sentences
    lengths = [len(sentence.split()) for sentence in sentences]
    min_length = min(lengths)
    max_length = max(lengths)
    if min_length == max_length:  # Handle case where all sentences have the same length
        return 0.0  # Return 0 if all sentences have the same length
    avg_sentence_length = sum(lengths) / len(sentences)
    normalized_length = (avg_sentence_length - min_length) / (max_length - min_length)
    return normalized_length

# Function to calculate punctuation frequency feature vector
def calculate_punctuation_frequency(text):
    punctuation_marks = set(string.punctuation)
    punctuation_counts = Counter(char for char in text if char in punctuation_marks)
    total_punctuation = sum(punctuation_counts.values())
    punctuation_distribution = {punct: count / total_punctuation for punct, count in punctuation_counts.items()}
    return [punctuation_distribution.get(mark, 0) for mark in punctuation_marks]

# Function to calculate POS tag frequency feature vector
def calculate_pos_tag_frequency(text):
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    pos_tag_counts = Counter(tag for word, tag in pos_tags)
    total_pos_tags = sum(pos_tag_counts.values())
    pos_tag_distribution = {tag: count / total_pos_tags for tag, count in pos_tag_counts.items()}
    all_tags = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
    return [pos_tag_distribution.get(tag, 0) for tag in all_tags]

# Function to calculate function word frequency feature vector
def calculate_function_word_frequency(text):
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    function_words_text = [word for word in tokens if word.lower() in stop_words]
    total_function_words = len(function_words_text)
    function_word_counts = Counter(function_words_text)
    function_word_frequencies = {word: count / total_function_words for word, count in function_word_counts.items()}
    all_function_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
    return [function_word_frequencies.get(word, 0) for word in all_function_words]
def is_passive_voice(tagged_sentence):
    for i in range(1, len(tagged_sentence)):
        if (
            tagged_sentence[i][0] == "by" and
            tagged_sentence[i - 1][1].startswith("V") and
            tagged_sentence[i][1] == "IN"
        ):
            return True
    return False

def calculate_passive_to_active_ratio(text):
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)
    passive_count = 0
    active_count = 0

    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged_sentence = nltk.pos_tag(words)

        if is_passive_voice(tagged_sentence):
            passive_count += 1
        else:
            active_count += 1

    return passive_count / active_count if active_count > 0 else 0

# Function to convert is_passive_voice result to binary feature vector
def is_passive_to_binary(passive_to_active_ratio):
    return 1 if passive_to_active_ratio > 1 else 0

def ngram_transition_graph_feature(text, n=5):
    tokens = nltk.word_tokenize(text)
    ngrams = list(nltk.ngrams(tokens, n))
    transition_graph = nx.DiGraph()
    transition_graph.add_nodes_from(ngrams)
    for i in range(len(ngrams) - 1):
        transition_graph.add_edge(ngrams[i], ngrams[i + 1])

    # Compute graph properties
    num_nodes = transition_graph.number_of_nodes()
    num_edges = transition_graph.number_of_edges()
    avg_degree = np.mean([val for (node, val) in transition_graph.degree()])
    density = nx.density(transition_graph)

    # Return computed graph properties as a feature vector
    return np.array([num_nodes, num_edges, avg_degree, density])

def type_token_ratio(text):
    tokens = nltk.word_tokenize(text)
    unique_tokens = set(tokens)
    return len(unique_tokens) / len(tokens)

# Gender prediction model
def predict_gender(text):
    # Load tokenizer
    with open("lstm_tokenizer.pickle", "rb") as handle:
        tokenizer = pickle.load(handle)

    # Load label encoder
    with open("lstm_label_encoder.pickle", "rb") as handle:
        label_encoder = pickle.load(handle)

    # Load max length
    with open("max_length.pickle", "rb") as handle:
        max_length = pickle.load(handle)

    # Load model
    model = load_model("lstm_trained_model.h5")

    # Tokenize input text
    new_data_sequence = tokenizer.texts_to_sequences([text])

    # Pad tokenized sequence
    new_data_padded = pad_sequences(new_data_sequence, maxlen=max_length)

    # Make predictions
    prediction = model.predict(new_data_padded)
    predicted_class = (prediction > 0.5).astype('int')[0][0]

    # Return predicted gender as numeric value - (0 for male, 1 for female)
    return predicted_class

# American or British detection using gpt 3.5
def detect_english_variant(text):
    prompt = "Please analyze the language and phrasing of the paragraph provided below and determine whether it aligns more closely with American English or British English.\n\n" + text + "\n\nLanguage variant:"

    # Set up OpenAI API
    openai.api_key = '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'

    # Use GPT-3.5 to determine English variant
    response = openai.Completion.create(
      engine="gpt-3.5-turbo-instruct",
      prompt=prompt,
      temperature=0,
      max_tokens=800
    )

    # Extracting the prediction from the response
    prediction_text = response.choices[0].text.strip()

    # Assign numeric values to the outcomes
    if prediction_text == "American English":
        return 1
    elif prediction_text == "British English":
        return 2
    else:
        return 0  # Return 0 for other cases or errors

# Function to check for double spaces after a full stop
def check_double_spaces_after_full_stop(text):
    double_spaces_count = text.count(".  ")
    if double_spaces_count >= 3:
        return 1
    else:
        return 0


# Function to calculate all features
def calculate_all_features(text):
    features = []

    # Sentence Length
    features.append(sentence_length(text))

    # Punctuation Frequency
    features.extend(calculate_punctuation_frequency(text))

    # POS Tag Frequency
    features.extend(calculate_pos_tag_frequency(text))

    # Function Word Frequency
    features.extend(calculate_function_word_frequency(text))

    # N-gram Transition Graph Feature
    features.extend(ngram_transition_graph_feature(text))

    # Type-Token Ratio
    features.append(type_token_ratio(text))

    # Passive to Active Ratio
    passive_to_active_ratio = calculate_passive_to_active_ratio(text)
    features.append(is_passive_to_binary(passive_to_active_ratio))

    # Gender prediction
    gender_prediction = predict_gender(text)
    features.append(gender_prediction)

    # American or British detection using GPT 3.5
    #english_variant = detect_english_variant(text)
    #features.append(english_variant)

    # Check for double spaces after a full stop
    double_spaces = check_double_spaces_after_full_stop(text)
    features.append(double_spaces)

    return features


In [20]:
def compute_similarity(feature_set1, feature_set2):
    similarities = {}

    # Sentence Length
    similarities['sentence_length'] = max(0, 1 - abs(feature_set1[0] - feature_set2[0]))

    # Punctuation Frequency
    punctuation_freq1 = feature_set1[1:33]
    punctuation_freq2 = feature_set2[1:33]
    for i in range(len(punctuation_freq1)):
        similarities['punctuation_{}'.format(i+1)] = max(0, 1 - abs(punctuation_freq1[i] - punctuation_freq2[i]))

    # POS Tag Frequency
    pos_tag_freq1 = feature_set1[33:69]
    pos_tag_freq2 = feature_set2[33:69]
    for i, tag in enumerate(['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']):
        similarities['pos_tag_{}'.format(tag)] = max(0, 1 - abs(pos_tag_freq1[i] - pos_tag_freq2[i]))

    # Function Word Frequency
    function_word_freq1 = feature_set1[69:196]
    function_word_freq2 = feature_set2[69:196]
    for i, word in enumerate(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']):
        similarities['function_word_{}'.format(word)] = max(0, 1 - abs(function_word_freq1[i] - function_word_freq2[i]))

    # N-gram Transition Graph Feature
    ngram_transition_feature1 = feature_set1[196:200]
    ngram_transition_feature2 = feature_set2[196:200]
    for i, feature in enumerate(['num_nodes', 'num_edges', 'avg_degree', 'density']):
        similarities['ngram_transition_{}'.format(feature)] = max(0, 1 - abs(ngram_transition_feature1[i] - ngram_transition_feature2[i]))

    # Type-Token Ratio
    similarities['type_token_ratio'] = max(0, 1 - abs(feature_set1[200] - feature_set2[200]))

    # Passive to Active Ratio
    similarities['passive_to_active_ratio'] = max(0, 1 - abs(feature_set1[201] - feature_set2[201]))

    # Gender prediction
    similarities['gender_prediction'] = 1 if feature_set1[202] == feature_set2[202] else 0

    # English variant
    #similarities['english_variant'] = 1 if feature_set1[203] == feature_set2[203] else 0

    # Double spaces after full stop
    similarities['double_spaces'] = 1 if feature_set1[203] == feature_set2[203] else 0

    return similarities


In [21]:
# Preprocess the similarity values to get the required key-value pairs
def preprocess(similarity_dict):
    fixed_indexes = [0, 3, 10, 13, 23, 29, 32, 200, 202, 203]  # Specify the indexes you want to extract
    keys = list(similarity_dict.keys())
    values = list(similarity_dict.values())
    extracted_keys = [keys[i] for i in fixed_indexes]
    extracted_values = [values[i] for i in fixed_indexes]
    return extracted_keys, extracted_values

In [22]:
import pandas as pd
import pickle

# Define suspect and anonymous texts
suspect_text = "Hi I'm weird lolll. Hi hwgohehd"
anonymous_text = "This weirdo thinks he can kill me"

# Extract features for the suspect text
suspect_features = calculate_all_features(suspect_text)

# Extract features for the anonymous text
anonymous_features = calculate_all_features(anonymous_text)

# Compute similarity between suspect and anonymous text
new_similarity_values = compute_similarity(suspect_features, anonymous_features)

# Get preprocessed keys and values
preprocessed_keys, preprocessed_values = preprocess(new_similarity_values)

# Create a DataFrame using the preprocessed values
new_data = pd.DataFrame([preprocessed_values], columns=[preprocessed_keys])

print(preprocessed_values)
print(new_similarity_values)
new_data.head()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[0.5, 1, 1, 1, 1, 1, 0.5, 0.875, 1, 1]
{'sentence_length': 0.5, 'punctuation_1': 1, 'punctuation_2': 1, 'punctuation_3': 1, 'punctuation_4': 1, 'punctuation_5': 1, 'punctuation_6': 1, 'punctuation_7': 1, 'punctuation_8': 0.5, 'punctuation_9': 1, 'punctuation_10': 1, 'punctuation_11': 1, 'punctuation_12': 1, 'punctuation_13': 1, 'punctuation_14': 1, 'punctuation_15': 1, 'punctuation_16': 1, 'punctuation_17': 1, 'punctuation_18': 1, 'punctuation_19': 1, 'punctuation_20': 1, 'punctuation_21': 1, 'punctuation_22': 1, 'punctuation_23': 1, 'punctuation_24': 1, 'punctuation_25': 1, 'punctuation_26': 1, 'punctuation_27': 1, 'punctuation_28': 1, 'punctuation_29': 1, 'punctuation_30': 1, 'punctuation_31': 1, 'punctuation_32': 0.5, 'pos_tag_CC': 1, 'pos_tag_CD': 1, 'pos_tag_DT': 0.8571428571428572, 'pos_tag_EX': 1, 'pos_tag_FW': 1, 'pos_tag_IN': 1, 'pos_tag_JJ': 0.9821428571428572, 'pos_tag_JJR': 1, 'pos_tag_JJS': 1, 'pos_tag_LS': 1, 'pos_tag_MD': 0.8571428571428572, 'pos_tag_NN': 0.75, 'pos_tag_

Unnamed: 0,sentence_length,punctuation_3,punctuation_10,punctuation_13,punctuation_23,punctuation_29,punctuation_32,type_token_ratio,gender_prediction,double_spaces
0,0.5,1,1,1,1,1,0.5,0.875,1,1


In [23]:
# Load the model from the pickle file
with open('random_forest_model.pickle', 'rb') as f:
    model_info = pickle.load(f)

# Make predictions and obtain predicted probabilities
y_pred_proba_new = model_info['model'].predict_proba(new_data)
y_pred_new = model_info['model'].predict(new_data)
probability_class_1 = y_pred_proba_new[:, 1]
print("Probability of class 1:", probability_class_1)
# Display predictions and probabilities
print("Predictions:", y_pred_new)

Probability of class 1: [0.70909091]
Predictions: [1]


