In [33]:
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import numpy as np
import pandas as pd
import re
from cleantext import clean
import string
from math import exp
from random import seed
from random import random
from random import randrange
from csv import reader

In [34]:
#Use for models 1 and 2
img_height = 300
img_width = 300

def create_model_basic():
    model = Sequential([
    layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    layers.Conv2D(16, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(32, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(2)
  ])

    model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

    return model

# Use for models 3 and 4
def create_model_advanced():
    data_augmentation = keras.Sequential([
    layers.RandomFlip("horizontal",
                      input_shape=(img_height, img_width,3)),
    layers.RandomRotation(0.1),
  ])

    model = Sequential([
    data_augmentation,
    layers.Rescaling(1./255),
    layers.Conv2D(16, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(32, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(2)
  ])

    model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
    return model

model = create_model_basic()
#model = create_model_advanced()

# Load in model 1
model1 = create_model_basic()
checkpoint_path = "training_1/cp.ckpt"
model1.load_weights(checkpoint_path)

# Load in model 2
model2 = create_model_advanced()
checkpoint_path = "training_2/cp.ckpt"
model2.load_weights(checkpoint_path)



<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd0a6b81040>

In [35]:
#function to classify images 
def evaluate_picture(model, filename):
    try: 
        img_path = filename
#         print(img_path)
        img = tf.keras.utils.load_img(
            img_path, target_size=(img_height, img_width)
      )
        img_array = tf.keras.utils.img_to_array(img)
        img_array = tf.expand_dims(img_array, 0) # Create a batch
        predictions = model.predict(img_array)
        score = tf.nn.softmax(predictions[0])
        class_names = ['bots', 'people']
#         print(
#           "This image most likely belongs to {} with a {:.2f} percent confidence."
#           .format(class_names[np.argmax(score)], 100 * np.max(score))
#       )
        result = class_names[np.argmax(score)]
        if result == 'bots':
            return 1
        return 0
    except: 
        return np.nan 


In [36]:
import re 
import pandas as pd
try:
    import cPickle as pickle
except ImportError:
    import pickle  
import numpy as np

with open('Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text
#instead of removing emojis, converts them to text 
df=pd.read_excel('Copy of Bot Data.xlsx')

#convert text to lower case 
df['biography']=df['biography'].str.lower()
df['comment']=df['comment'].str.lower()

#expand contractions 
contractions_dict = {"aren't": "are not", "don't": "do not", "Don't": "do not", "I'm": "I am", "i'm": "I am", 
                    "it's": "it is", "y'all": "you all", "Y'all": "you all", "didn't": "did not", "won't": "will not",
                   "I'll": "I will", "i'll": "I will", "can't": "can not"}
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

def expand_contractions(text, expand_dict):
    def replace(match):
        return expand_dict[match.group(0)]
    return contractions_re.sub(replace, text)
df['biography']=df['biography'].apply(lambda x:expand_contractions(x, contractions_dict ))
df['comment']=df['comment'].apply(lambda x:expand_contractions(x, contractions_dict))

bios=df['biography'].astype(str)
usernames=df['Username']
comments=df['comment']
new_bios=[]
for b in bios: 
    temp=convert_emojis_to_word(b)
    new_bios.append(temp)
new_comments=[]
for c in comments:
    temp=convert_emojis_to_word(c)
    new_comments.append(temp)
df_filtered=df.copy()
df_filtered['biography']=new_bios
df_filtered['comment']=new_comments
from cleantext import clean

new_bios=[]
for b in df_filtered['biography']: 
    temp=clean(b, no_emoji=True)
    temp=temp.replace("\n", " ")
    if temp=='':
        new_bios.append('None')
    else: 
        new_bios.append(temp)
new_comments=[]
for c in df_filtered['comment']:
    temp=clean(c, no_emoji=True)
    temp=temp.replace("\n", " ")
    if temp=='':
        new_comments.append('None')
    else: 
        new_comments.append(temp)

df_filtered['biography']=new_bios
df_filtered['comment']=new_comments
df_filtered['combined']=df_filtered['biography'] + ' ' + df_filtered['comment']

In [37]:
df_filtered=df_filtered.sample(frac=1, random_state=10)
image_classifications=[]
usernames=df_filtered['Username']
for user in usernames:
    filename=user+'.png'
    classification=evaluate_picture(model1, filename)
    image_classifications.append(classification)
real_values=df_filtered['bot classification (0-not a bot, 1-bot)']
df_filtered=df_filtered.drop(['bot classification (0-not a bot, 1-bot)'], axis=1)
df_filtered['image_value']=image_classifications
df_filtered['classification']=real_values





Unnamed: 0,Username,Follower Count,Following Count,Follower/Following Ratio,Number of posts,biography,# of likes on comment,comment,time of comment after post (minutes),combined,image_value,classification
170,metalheadkjk,2372,297,7.986532,25,i'm kurt kennedy from portland tennessee!! i l...,0,the nashville predators suck,12,i'm kurt kennedy from portland tennessee!! i l...,0.0,0
282,dsreis,912,2412,0.378109,295,brasileiro | lds | married to the love of my l...,440,custody papers. they were never married.,3,brasileiro | lds | married to the love of my l...,0.0,0
132,ulan_sevbeni,3401,108,31.490741,0,umudumu kaybettim:)),0,"for a start i deposited $15,000 to test thewat...",2,umudumu kaybettim:)) for a start i deposited $...,1.0,1
65,earlenemega,26,268,0.097015,0,linkr.bio/earlenemega,96,hushed_facetag your friends who do not have a ...,0,linkr.bio/earlenemega hushed_facetag your frie...,1.0,1
24,catherine.wilson_,7,35,0.200000,2,backup-file-2861ba.netlify.app,0,do not watch my stories !!,2,backup-file-2861ba.netlify.app do not watch my...,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
156,jesilakunil67,34,238,0.142857,0,clinquant-cranachan-595473.netlify.app,23,"how about my stories,keep it or leave it?",1,clinquant-cranachan-595473.netlify.app how abo...,1.0,1
123,harris_gilpin,2532,1504,1.683511,9,od fearingdove private figure ! manmedium-dark...,0,"i am funding heavy_dollar_sign5,000 for the fi...",1,od fearingdove private figure ! manmedium-dark...,0.0,1
15,gabriella.jaz11,4,39,0.102564,0,gabriella new account help support beating_hea...,1,what do you guys do when you take a shower? fa...,0,gabriella new account help support beating_hea...,1.0,1
125,clrkkk.amanda93,571,4,142.750000,3,linkr.bio/amanda-clark,163,charmingly!,1,linkr.bio/amanda-clark charmingly!,1.0,1


In [46]:
#classifying the biographies and comments combined 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer


df_filtered=df_filtered.dropna()

vectorizer1 =  CountVectorizer(min_df=0, lowercase=False)
vectorizer1.fit(df_filtered['combined'])
text_array = vectorizer1.transform(df_filtered['combined']).toarray()
text_df = pd.DataFrame(data=text_array,columns = vectorizer1.get_feature_names())
df_filtered

from sklearn.feature_selection import RFE

model = LogisticRegression(max_iter=1000)
selector1=RFE(estimator=model, n_features_to_select=50, step=1) #chose arbitrarily to select 50 of the most important words out of 2000 to make it more manageable

#split dataset into features and labels
Y1=df_filtered['classification']
X1=text_df

#run RFE
rfe1 = selector1.fit_transform(X1, Y1)
filter1=(selector1.get_support())

##
#filter columns using data from RFE
filter1=list(filter1)
current_cols = list(X1.columns)

#figure out which columns to keep
important_cols=[]
for index in range(len(filter1)):
    if (filter1[index])==True:
        important_cols.append(current_cols[index])
text_df=text_df[important_cols]
df_filtered=df_filtered.reset_index()
df_combined=pd.concat([text_df, df_filtered], axis=1)
df_combined.replace([np.inf, -np.inf], np.nan, inplace=True)
df_combined=df_combined.drop(['comment', 'biography', 'combined', 'Username'], axis=1)
df_combined

Unnamed: 0,000,100,account,after,ain,app,athlete,bf,bio,bit,...,youtube,index,Follower Count,Following Count,Follower/Following Ratio,Number of posts,# of likes on comment,time of comment after post (minutes),image_value,classification
0,0,0,0,0,0,0,0,0,0,0,...,0,170,2372,297,7.986532,25,0,12,0.0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,282,912,2412,0.378109,295,440,3,0.0,0
2,1,1,0,1,0,0,0,0,0,0,...,0,132,3401,108,31.490741,0,0,2,1.0,1
3,0,0,0,0,0,0,0,0,1,0,...,0,65,26,268,0.097015,0,96,0,1.0,1
4,0,0,0,0,0,1,0,0,0,0,...,0,24,7,35,0.200000,2,0,2,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,0,0,0,0,0,1,0,0,0,0,...,0,156,34,238,0.142857,0,23,1,1.0,1
283,1,0,0,0,0,0,0,0,0,0,...,0,123,2532,1504,1.683511,9,0,1,0.0,1
284,0,0,1,0,0,0,0,0,0,0,...,0,15,4,39,0.102564,0,1,0,1.0,1
285,0,0,0,0,0,0,0,0,1,0,...,0,125,571,4,142.750000,3,163,1,1.0,1


In [47]:
df_combined.to_csv('neural_network_input.csv')

In [48]:
#neural network 
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

def dataset_minmax(dataset):
    minmax = list()
    stats = [[min(column), max(column)] for column in zip(*dataset)]
    return stats

def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)-1):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
            
from math import exp
from random import seed
from random import random
 
# Initialize a network
def initialize_network(n_inputs, n_hidden, n_outputs):
    network = list()
    hidden_layer = [{'weights':[random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    network.append(hidden_layer)
    output_layer = [{'weights':[random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    network.append(output_layer)
    return network
 
# Calculate neuron activation for an input
def activate(weights, inputs):
    activation = weights[-1]
    for i in range(len(weights)-1):
        activation += weights[i] * inputs[i]
    return activation
 
# Transfer neuron activation
def transfer(activation):
    return 1.0 / (1.0 + exp(-activation))
 
# Forward propagate input to a network output
def forward_propagate(network, row):
    inputs = row
    for layer in network:
        new_inputs = []
        for neuron in layer:
            activation = activate(neuron['weights'], inputs)
            neuron['output'] = transfer(activation)
            new_inputs.append(neuron['output'])
        inputs = new_inputs
    return inputs
 
# Calculate the derivative of an neuron output
def transfer_derivative(output):
    return output * (1.0 - output)
 
# Backpropagate error and store in neurons
def backward_propagate_error(network, expected):
    for i in reversed(range(len(network))):
        layer = network[i]
        errors = list()
        if i != len(network)-1:
            for j in range(len(layer)):
                error = 0.0
                for neuron in network[i + 1]:
                    error += (neuron['weights'][j] * neuron['delta'])
                errors.append(error)
        else:
            for j in range(len(layer)):
                neuron = layer[j]
                errors.append(neuron['output'] - expected[j])
        for j in range(len(layer)):
            neuron = layer[j]
            neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])
            
def update_weights(network, row, l_rate):
    for i in range(len(network)):
        inputs = row[:-1]
        if i != 0:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in network[i]:
            for j in range(len(inputs)):
                neuron['weights'][j] -= l_rate * neuron['delta'] * inputs[j]
            neuron['weights'][-1] -= l_rate * neuron['delta']

# Train a network for a fixed number of epochs
def train_network(network, train, l_rate, n_epoch, n_outputs):
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            expected = [0 for i in range(n_outputs)]
            expected[row[-1]] = 1
            sum_error += sum([(expected[i]-outputs[i])**2 for i in range(len(expected))])
            backward_propagate_error(network, expected)
            update_weights(network, row, l_rate)
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))

# Make a prediction with a network
def predict(network, row):
    outputs = forward_propagate(network, row)
    return outputs.index(max(outputs))

In [56]:
dataset = load_csv('neural_network_input.csv')

for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)

str_column_to_int(dataset, len(dataset[0])-1)
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

train=dataset[:230]
test=dataset[230:]

#put everything together
n_folds = 5
learning_rate = 0.1
n_epoch = 500
n_layers = 5
n_inputs = len(train[0]) - 1
n_outputs = len(set([row[-1] for row in train]))
network = initialize_network(n_inputs, n_layers, n_outputs)
train_network(network, train, learning_rate, n_epoch, n_outputs)



>epoch=0, lrate=0.100, error=136.754
>epoch=1, lrate=0.100, error=114.886
>epoch=2, lrate=0.100, error=110.117
>epoch=3, lrate=0.100, error=101.270
>epoch=4, lrate=0.100, error=85.387
>epoch=5, lrate=0.100, error=64.741
>epoch=6, lrate=0.100, error=46.384
>epoch=7, lrate=0.100, error=33.624
>epoch=8, lrate=0.100, error=25.574
>epoch=9, lrate=0.100, error=20.482
>epoch=10, lrate=0.100, error=17.112
>epoch=11, lrate=0.100, error=14.755
>epoch=12, lrate=0.100, error=13.018
>epoch=13, lrate=0.100, error=11.678
>epoch=14, lrate=0.100, error=10.605
>epoch=15, lrate=0.100, error=9.717
>epoch=16, lrate=0.100, error=8.965
>epoch=17, lrate=0.100, error=8.314
>epoch=18, lrate=0.100, error=7.743
>epoch=19, lrate=0.100, error=7.234
>epoch=20, lrate=0.100, error=6.776
>epoch=21, lrate=0.100, error=6.361
>epoch=22, lrate=0.100, error=5.982
>epoch=23, lrate=0.100, error=5.635
>epoch=24, lrate=0.100, error=5.315
>epoch=25, lrate=0.100, error=5.019
>epoch=26, lrate=0.100, error=4.745
>epoch=27, lrate=0.

>epoch=225, lrate=0.100, error=0.215
>epoch=226, lrate=0.100, error=0.214
>epoch=227, lrate=0.100, error=0.212
>epoch=228, lrate=0.100, error=0.211
>epoch=229, lrate=0.100, error=0.210
>epoch=230, lrate=0.100, error=0.209
>epoch=231, lrate=0.100, error=0.208
>epoch=232, lrate=0.100, error=0.207
>epoch=233, lrate=0.100, error=0.206
>epoch=234, lrate=0.100, error=0.205
>epoch=235, lrate=0.100, error=0.204
>epoch=236, lrate=0.100, error=0.203
>epoch=237, lrate=0.100, error=0.202
>epoch=238, lrate=0.100, error=0.201
>epoch=239, lrate=0.100, error=0.200
>epoch=240, lrate=0.100, error=0.199
>epoch=241, lrate=0.100, error=0.198
>epoch=242, lrate=0.100, error=0.197
>epoch=243, lrate=0.100, error=0.196
>epoch=244, lrate=0.100, error=0.195
>epoch=245, lrate=0.100, error=0.194
>epoch=246, lrate=0.100, error=0.193
>epoch=247, lrate=0.100, error=0.192
>epoch=248, lrate=0.100, error=0.191
>epoch=249, lrate=0.100, error=0.190
>epoch=250, lrate=0.100, error=0.189
>epoch=251, lrate=0.100, error=0.188
>

>epoch=448, lrate=0.100, error=0.095
>epoch=449, lrate=0.100, error=0.094
>epoch=450, lrate=0.100, error=0.094
>epoch=451, lrate=0.100, error=0.094
>epoch=452, lrate=0.100, error=0.094
>epoch=453, lrate=0.100, error=0.093
>epoch=454, lrate=0.100, error=0.093
>epoch=455, lrate=0.100, error=0.093
>epoch=456, lrate=0.100, error=0.093
>epoch=457, lrate=0.100, error=0.092
>epoch=458, lrate=0.100, error=0.092
>epoch=459, lrate=0.100, error=0.092
>epoch=460, lrate=0.100, error=0.092
>epoch=461, lrate=0.100, error=0.091
>epoch=462, lrate=0.100, error=0.091
>epoch=463, lrate=0.100, error=0.091
>epoch=464, lrate=0.100, error=0.091
>epoch=465, lrate=0.100, error=0.091
>epoch=466, lrate=0.100, error=0.090
>epoch=467, lrate=0.100, error=0.090
>epoch=468, lrate=0.100, error=0.090
>epoch=469, lrate=0.100, error=0.090
>epoch=470, lrate=0.100, error=0.089
>epoch=471, lrate=0.100, error=0.089
>epoch=472, lrate=0.100, error=0.089
>epoch=473, lrate=0.100, error=0.089
>epoch=474, lrate=0.100, error=0.089
>

In [57]:
counter=0
for row in test:
    prediction = predict(network, row)
    actual=row[-1]
    if prediction==actual:
        counter+=1
print("The accuracy is " + str(counter/len(test)*100) + '%')

The accuracy is 100.0%


In [58]:
import pickle
pickle.dump(network , open( 'combined_network_weights.pkl' , 'wb' ) )